InĀ [1]:
import pandas as pd
import numpy as np
from scipy.stats import chi2_contingency
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
from sklearn.model_selection import train_test_split
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
warnings.filterwarnings('ignore')
InĀ [2]:
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

Dataset Creation¶

InĀ [5]:
# nhanes target
nhanes = pd.read_csv("/Users/kevinnguyen/Downloads/nhanes_base_target_final.csv", usecols=range(1, len(pd.read_csv("/Users/kevinnguyen/Downloads/nhanes_base_target_final.csv").columns)))
InĀ [7]:
# demographics
demographics_0506 = pd.read_sas("/Users/kevinnguyen/Downloads/nhanes_0506/nhanes_d_DEMO_D_0506.xpt")
demographics_0708 = pd.read_sas("/Users/kevinnguyen/Downloads/nhanes_0708/nhanes_d_DEMO_E_0708.xpt")
demographics_0910 = pd.read_sas("/Users/kevinnguyen/Downloads/nahnes_0910/nhanes_d_DEMO_F_0910.xpt")
demographics_1112 = pd.read_sas("/Users/kevinnguyen/Downloads/nhanes_1112/nhanes_d_DEMO_G_1112.xpt")
demographics_1314 = pd.read_sas("/Users/kevinnguyen/Downloads/nhanes_1314/nhanes_d_DEMO_H_1314.xpt")
demographics_1516 = pd.read_sas("/Users/kevinnguyen/Downloads/nhanes_1516/nhanes_d_DEMO_H_1516.xpt")
InĀ [9]:
# 0506 column name for income is INDHHINC --> change to INDHHIN2 to match others
demographics_0506.rename(columns = {"INDHHINC": "INDHHIN2"}, inplace = True)
demographics_colnames = ["SEQN", "RIAGENDR", "RIDAGEYR", "RIDRETH1", "DMDEDUC2", "INDHHIN2"]
InĀ [11]:
# dietary
dietary_0506 = pd.read_sas("/Users/kevinnguyen/Downloads/nhanes_0506/nhanes_dietary_DR1TOT_D_0506.xpt")
dietary_0708 = pd.read_sas("/Users/kevinnguyen/Downloads/nhanes_0708/nhanes_dietary_DR1TOT_E_0708.xpt")
dietary_0910 = pd.read_sas("/Users/kevinnguyen/Downloads/nahnes_0910/nhanes_dietary_DR1TOT_F_0910.xpt")
dietary_1112 = pd.read_sas("/Users/kevinnguyen/Downloads/nhanes_1112/nhanes_dietary_DR1TOT_G_1112.xpt")
dietary_1314 = pd.read_sas("/Users/kevinnguyen/Downloads/nhanes_1314/nhanes_dietary_DR1TOT_H_1314.xpt")
dietary_1516 = pd.read_sas("/Users/kevinnguyen/Downloads/nhanes_1516/nhanes_dietary_DR1TOT_I_1516.xpt")
InĀ [12]:
dietary_colnames = ["SEQN", "DR1TKCAL", "DR1TCARB", "DR1TSUGR", "DR1TTFAT", "DR1TFIBE", "DR1_320Z"]
InĀ [15]:
# examinations
examination_0506 = pd.read_sas("/Users/kevinnguyen/Downloads/nhanes_0506/nhanes_e_BMX_D_0506.xpt")
examination_0708 = pd.read_sas("/Users/kevinnguyen/Downloads/nhanes_0708/nhanes_e_BMX_E_0708.xpt")
examination_0910 = pd.read_sas("/Users/kevinnguyen/Downloads/nahnes_0910/nhanes_e_BMX_F_0910.xpt")
examination_1112 = pd.read_sas("/Users/kevinnguyen/Downloads/nhanes_1112/nhanes_e_BMX_G_1112.xpt")
examination_1314 = pd.read_sas("/Users/kevinnguyen/Downloads/nhanes_1314/nhanes_e_BMX_H_1314.xpt")
examination_1516 = pd.read_sas("/Users/kevinnguyen/Downloads/nhanes_1516/nhanes_e_BMX_H_1516.xpt")
InĀ [17]:
examination_colnames = ["SEQN", "BMXBMI"]
InĀ [19]:
# laboratory
## metals
CdHgPb_0506 = pd.read_sas("/Users/kevinnguyen/Downloads/nhanes_0506/nhanes_l_PBCD_D_0506.xpt")
CdHgPb_0708 = pd.read_sas("/Users/kevinnguyen/Downloads/nhanes_0708/nhanes_l_PBCD_E_0708.xpt")
CdHgPb_0910 = pd.read_sas("/Users/kevinnguyen/Downloads/nahnes_0910/nhanes_l_PBCD_F_0910.xpt")
CdHgPb_1112 = pd.read_sas("/Users/kevinnguyen/Downloads/nhanes_1112/nhanes_l_PBCD_G_1112.xpt")
CdHgPb_1314 = pd.read_sas("/Users/kevinnguyen/Downloads/nhanes_1314/nhanes_l_PBCD_H_1314.xpt")
CdHgPb_1516 = pd.read_sas("/Users/kevinnguyen/Downloads/nhanes_1516/nhanes_l_PBCD_I_1516.xpt")
arsenic_0506 = pd.read_sas("/Users/kevinnguyen/Downloads/nhanes_0506/nhanes_l_UAS_D_0506.xpt")
arsenic_0708 = pd.read_sas("/Users/kevinnguyen/Downloads/nhanes_0708/nhanes_l_UAS_E_0708.xpt")
arsenic_0910 = pd.read_sas("/Users/kevinnguyen/Downloads/nahnes_0910/nhanes_l_UAS_F_0910.xpt")
arsenic_1112 = pd.read_sas("/Users/kevinnguyen/Downloads/nhanes_1112/nhanes_l_UAS_G_1112.xpt")
arsenic_1314 = pd.read_sas("/Users/kevinnguyen/Downloads/nhanes_1314/nhanes_l_UTAS_H_1314.xpt")
arsenic_1516 = pd.read_sas("/Users/kevinnguyen/Downloads/nhanes_1516/nhanes_l_UTAS_I_1516.xpt")

## glucose & insulin
glu_0506 = pd.read_sas("/Users/kevinnguyen/Downloads/nhanes_0506/nhanes_l_GLU_D_0506.xpt")
glu_0708 = pd.read_sas("/Users/kevinnguyen/Downloads/nhanes_0708/nhanes_l_GLU_E_0708.xpt")
glu_0910 = pd.read_sas("/Users/kevinnguyen/Downloads/nahnes_0910/nhanes_l_GLU_F_0910.xpt")
glu_1112 = pd.read_sas("/Users/kevinnguyen/Downloads/nhanes_1112/nhanes_l_GLU_G_1112.xpt")
glu_1314 = pd.read_sas("/Users/kevinnguyen/Downloads/nhanes_1314/nhanes_l_GLU_H_1314.xpt")
glu_1516 = pd.read_sas("/Users/kevinnguyen/Downloads/nhanes_1516/nhanes_l_GLU_I_1516.xpt")
ins_1314 = pd.read_sas("/Users/kevinnguyen/Downloads/nhanes_1314/nhanes_l_INS_H_1314.xpt")
ins_1516 = pd.read_sas("/Users/kevinnguyen/Downloads/nhanes_1516/nhanes_l_INS_I_1516.xpt")

## pfas
pfa_0506 = pd.read_sas("/Users/kevinnguyen/Downloads/nhanes_0506/nhanes_l_PFC_D_0506.xpt")
pfa_0708 = pd.read_sas("/Users/kevinnguyen/Downloads/nhanes_0708/nhanes_l_PFC_E_0708.xpt")
pfa_0910 = pd.read_sas("/Users/kevinnguyen/Downloads/nahnes_0910/nhanes_l_PFC_F_0910.xpt")
pfa_1112 = pd.read_sas("/Users/kevinnguyen/Downloads/nhanes_1112/nhanes_l_PFC_G_1112.xpt")
pfa_1314 = pd.read_sas("/Users/kevinnguyen/Downloads/nhanes_1314/nhanes_l_PFAS_H_1314.xpt")
pfa_1516 = pd.read_sas("/Users/kevinnguyen/Downloads/nhanes_1516/nhanes_l_PFAS_I_1516.xpt")

## pahs
pah_0506 = pd.read_sas("/Users/kevinnguyen/Downloads/nhanes_0506/nhanes_l_PAH_D_0506.xpt")
pah_0708 = pd.read_sas("/Users/kevinnguyen/Downloads/nhanes_0708/nhanes_l_PAH_E_0708.xpt")
pah_0910 = pd.read_sas("/Users/kevinnguyen/Downloads/nahnes_0910/nhanes_l_PAH_F_0910.xpt")
pah_1112 = pd.read_sas("/Users/kevinnguyen/Downloads/nhanes_1112/nhanes_l_PAH_G_1112.xpt")
pah_1314 = pd.read_sas("/Users/kevinnguyen/Downloads/nhanes_1314/nhanes_l_PAH_H_1314.xpt")
pah_1516 = pd.read_sas("/Users/kevinnguyen/Downloads/nhanes_1516/nhanes_l_PAH_I_1516.xpt")

## environmental phalates
eph_0506 = pd.read_sas("/Users/kevinnguyen/Downloads/nhanes_0506/nhanes_l_EPH_D_0506.xpt")
eph_0708 = pd.read_sas("/Users/kevinnguyen/Downloads/nhanes_0708/nhanes_l_EPH_E_0708.xpt")
eph_0910 = pd.read_sas("/Users/kevinnguyen/Downloads/nahnes_0910/nhanes_l_EPH_F_0910.xpt")
eph_1112 = pd.read_sas("/Users/kevinnguyen/Downloads/nhanes_1112/nhanes_l_EPH_G_1112.xpt")
eph_1314 = pd.read_sas("/Users/kevinnguyen/Downloads/nhanes_1314/nhanes_l_EPHPP_H_1314.xpt")
eph_1516 = pd.read_sas("/Users/kevinnguyen/Downloads/nhanes_1516/nhanes_l_EPHPP_I_1516.xpt")

## phthalates
phthalate_0506 = pd.read_sas("/Users/kevinnguyen/Downloads/nhanes_0506/nhanes_l_PHTHTE_D_0506.xpt")
phthalate_0708 = pd.read_sas("/Users/kevinnguyen/Downloads/nhanes_0708/nhanes_l_PHTHTE_E_0708.xpt")
phthalate_0910 = pd.read_sas("/Users/kevinnguyen/Downloads/nahnes_0910/nhanes_l_PHTHTE_F_0910.xpt")
phthalate_1112 = pd.read_sas("/Users/kevinnguyen/Downloads/nhanes_1112/nhanes_l_PHTHTE_G_1112.xpt")
phthalate_1314 = pd.read_sas("/Users/kevinnguyen/Downloads/nhanes_1314/nhanes_l_PHTHTE_H_1314.xpt")
phthalate_1516 = pd.read_sas("/Users/kevinnguyen/Downloads/nhanes_1516/nhanes_l_PHTHTE_I_1516.xpt")
InĀ [21]:
# combining ins dataset with glu for 1314 and 1516
ins_1314_new = ins_1314[["SEQN", "LBXIN"]]
ins_1516_new = ins_1516[["SEQN", "LBXIN"]]
glu_1314 = pd.merge(glu_1314, ins_1314_new, on='SEQN', how='left')
glu_1516 = pd.merge(glu_1516, ins_1516_new, on='SEQN', how='left')

glucose_colnames = ["SEQN", "LBXGLU", "LBXIN"]
CdHgPb_colnames = ["SEQN", "LBXBCD", "LBXBPB", "LBXTHG"]
arsenic_colnames = ["SEQN", "URXUAS"]
pfa_colnames = ["SEQN", "LBXMPAH", "LBXPFDO", "LBXPFNA", "LBXPFHS", "LBXPFDE", "LBXPFUA"]
pah_colnames = ["SEQN", "URXP01", "URXP02", "URXP03", "URXP04", "URXP06"]
eph_colnames = ["SEQN", "URXBPH", "URXTRS", "URXBP3", "URXBUP", "URXEPB", "URXMPB", "URXPPB"]
phthalate_colnames = ["SEQN", "URXCNP", "URXCOP", "URXECP", "URXMBP", "URXMC1", "URXMEP", "URXMHH", "URXMHP", "URXMIB", "URXMNP", "URXMOH", "URXMZP"]
InĀ [23]:
# questionnaires
## alcohol
alq_0506 = pd.read_sas("/Users/kevinnguyen/Downloads/nhanes_0506/nhanes_q_ALQ_D_0506.xpt")
alq_0708 = pd.read_sas("/Users/kevinnguyen/Downloads/nhanes_0708/nhanes_q_ALQ_E_0708.xpt")
alq_0910 = pd.read_sas("/Users/kevinnguyen/Downloads/nahnes_0910/nhanes_q_ALQ_F_0910.xpt")
alq_1112 = pd.read_sas("/Users/kevinnguyen/Downloads/nhanes_1112/nhanes_q_ALQ_G_1112.xpt")
alq_1314 = pd.read_sas("/Users/kevinnguyen/Downloads/nhanes_1314/nhanes_q_ALQ_H_1314.xpt")
alq_1516 = pd.read_sas("/Users/kevinnguyen/Downloads/nhanes_1516/nhanes_q_ALQ_I_1516.xpt")

## physical activity
paq_0506 = pd.read_sas("/Users/kevinnguyen/Downloads/nhanes_0506/nhanes_q_PAQ_D_0506.xpt")
paq_0708 = pd.read_sas("/Users/kevinnguyen/Downloads/nhanes_0708/nhanes_q_PAQ_E_0708.xpt")
paq_0910 = pd.read_sas("/Users/kevinnguyen/Downloads/nahnes_0910/nhanes_q_PAQ_F_0910.xpt")
paq_1112 = pd.read_sas("/Users/kevinnguyen/Downloads/nhanes_1112/nhanes_q_PAQ_G_1112.xpt")
paq_1314 = pd.read_sas("/Users/kevinnguyen/Downloads/nhanes_1314/nhanes_q_PAQ_H_1314.xpt")
paq_1516 = pd.read_sas("/Users/kevinnguyen/Downloads/nhanes_1516/nhanes_q_PAQ_I_1516.xpt")

## smoking
smq_0506 = pd.read_sas("/Users/kevinnguyen/Downloads/nhanes_0506/nhanes_q_SMQ_D_0506.xpt")
smq_0708 = pd.read_sas("/Users/kevinnguyen/Downloads/nhanes_0708/nhanes_q_SMQ_E_0708.xpt")
smq_0910 = pd.read_sas("/Users/kevinnguyen/Downloads/nahnes_0910/nhanes_q_SMQ_F_0910.xpt")
smq_1112 = pd.read_sas("/Users/kevinnguyen/Downloads/nhanes_1112/nhanes_q_SMQ_G_1112.xpt")
smq_1314 = pd.read_sas("/Users/kevinnguyen/Downloads/nhanes_1314/nhanes_q_SMQ_H_1314.xpt")
smq_1516 = pd.read_sas("/Users/kevinnguyen/Downloads/nhanes_1516/nhanes_q_SMQ_I_1516.xpt")

## mental health
dpq_0506 = pd.read_sas("/Users/kevinnguyen/Downloads/nhanes_0506/nhanes_q_DPQ_D_0506.xpt")
dpq_0708 = pd.read_sas("/Users/kevinnguyen/Downloads/nhanes_0708/nhanes_q_DPQ_E_0708.xpt")
dpq_0910 = pd.read_sas("/Users/kevinnguyen/Downloads/nahnes_0910/nhanes_q_DPQ_F_0910.xpt")
dpq_1112 = pd.read_sas("/Users/kevinnguyen/Downloads/nhanes_1112/nhanes_q_DPQ_G_1112.xpt")
dpq_1314 = pd.read_sas("/Users/kevinnguyen/Downloads/nhanes_1314/nhanes_q_DPQ_H_1314.xpt")
dpq_1516 = pd.read_sas("/Users/kevinnguyen/Downloads/nhanes_1516/nhanes_q_DPQ_I_1516.xpt")

## sleep disorders
slq_0506 = pd.read_sas("/Users/kevinnguyen/Downloads/nhanes_0506/nhanes_q_SLQ_D_0506.xpt")
slq_0708 = pd.read_sas("/Users/kevinnguyen/Downloads/nhanes_0708/nhanes_q_SLQ_E_0708.xpt")
slq_0910 = pd.read_sas("/Users/kevinnguyen/Downloads/nahnes_0910/nhanes_q_SLQ_F_0910.xpt")
slq_1112 = pd.read_sas("/Users/kevinnguyen/Downloads/nhanes_1112/nhanes_q_SLQ_G_1112.xpt")
slq_1314 = pd.read_sas("/Users/kevinnguyen/Downloads/nhanes_1314/nhanes_q_SLQ_H_1314.xpt")
slq_1516 = pd.read_sas("/Users/kevinnguyen/Downloads/nhanes_1516/nhanes_q_SLQ_I_1516.xpt")

## medical conditions
mcq_0506 = pd.read_sas("/Users/kevinnguyen/Downloads/nhanes_0506/nhanes_q_MCQ_D_0506.xpt")
mcq_0708 = pd.read_sas("/Users/kevinnguyen/Downloads/nhanes_0708/nhanes_q_MCQ_E_0708.xpt")
mcq_0910 = pd.read_sas("/Users/kevinnguyen/Downloads/nahnes_0910/nhanes_q_MCQ_F_0910.xpt")
mcq_1112 = pd.read_sas("/Users/kevinnguyen/Downloads/nhanes_1112/nhanes_q_MCQ_G_1112.xpt")
mcq_1314 = pd.read_sas("/Users/kevinnguyen/Downloads/nhanes_1314/nhanes_q_MCQ_H_1314.xpt")
mcq_1516 = pd.read_sas("/Users/kevinnguyen/Downloads/nhanes_1516/nhanes_q_MCQ_I_1516.xpt")

## hospital utilization & access to care
huq_0506 = pd.read_sas("/Users/kevinnguyen/Downloads/nhanes_0506/nhanes_q_HUQ_D_0506.xpt")
huq_0708 = pd.read_sas("/Users/kevinnguyen/Downloads/nhanes_0708/nhanes_q_HUQ_E_0708.xpt")
huq_0910 = pd.read_sas("/Users/kevinnguyen/Downloads/nahnes_0910/nhanes_q_HUQ_F_0910.xpt")
huq_1112 = pd.read_sas("/Users/kevinnguyen/Downloads/nhanes_1112/nhanes_q_HUQ_G_1112.xpt")
huq_1314 = pd.read_sas("/Users/kevinnguyen/Downloads/nhanes_1314/nhanes_q_HUQ_H_1314.xpt")
huq_1516 = pd.read_sas("/Users/kevinnguyen/Downloads/nhanes_1516/nhanes_q_HUQ_I_1516.xpt")

## health insurance
hiq_0506 = pd.read_sas("/Users/kevinnguyen/Downloads/nhanes_0506/nhanes_q_HIQ_D_0506.xpt")
hiq_0708 = pd.read_sas("/Users/kevinnguyen/Downloads/nhanes_0708/nhanes_q_HIQ_E_0708.xpt")
hiq_0910 = pd.read_sas("/Users/kevinnguyen/Downloads/nahnes_0910/nhanes_q_HIQ_F_0910.xpt")
hiq_1112 = pd.read_sas("/Users/kevinnguyen/Downloads/nhanes_1112/nhanes_q_HIQ_G_1112.xpt")
hiq_1314 = pd.read_sas("/Users/kevinnguyen/Downloads/nhanes_1314/nhanes_q_HIQ_H_1314.xpt")
hiq_1516 = pd.read_sas("/Users/kevinnguyen/Downloads/nhanes_1516/nhanes_q_HIQ_I_1516.xpt")

## housing characteristics
hoq_0506 = pd.read_sas("/Users/kevinnguyen/Downloads/nhanes_0506/nhanes_q_HOQ_D_0506.xpt")
hoq_0708 = pd.read_sas("/Users/kevinnguyen/Downloads/nhanes_0708/nhanes_q_HOQ_E_0708.xpt")
hoq_0910 = pd.read_sas("/Users/kevinnguyen/Downloads/nahnes_0910/nhanes_q_HOQ_F_0910.xpt")
hoq_1112 = pd.read_sas("/Users/kevinnguyen/Downloads/nhanes_1112/nhanes_q_HOQ_G_1112.xpt")
hoq_1314 = pd.read_sas("/Users/kevinnguyen/Downloads/nhanes_1314/nhanes_q_HOQ_H_1314.xpt")
hoq_1516 = pd.read_sas("/Users/kevinnguyen/Downloads/nhanes_1516/nhanes_q_HOQ_I_1516.xpt")
InĀ [24]:
alq_colnames = ["SEQN", "ALQ130"]
# 0506, 0708, 0910 column name for # hours watch TV or videos past 30 days is PAD590 --> change to PAQ710 to match the rest
paq_0506.rename(columns = {"PAD590": "PAQ710"}, inplace = True)
paq_0708.rename(columns = {"PAD590": "PAQ710"}, inplace = True)
paq_0910.rename(columns = {"PAD590": "PAQ710"}, inplace = True)
paq_colnames = ["SEQN", "PAQ710"]
smq_colnames = ["SEQN", "SMQ020"]
dpq_colnames = ["SEQN", "DPQ010"]
slq_colnames = ["SEQN", "SLQ050"]
mcq_colnames = ["SEQN", "MCQ300C"]
# 0506, 0708, 0910, 1112 column name for # times received healthcare over past year is HUQ050 --> change to HUQ051 to match 1516
huq_0506.rename(columns = {"HUQ050": "HUQ051"}, inplace = True)
huq_0708.rename(columns = {"HUQ050": "HUQ051"}, inplace = True)
huq_0910.rename(columns = {"HUQ050": "HUQ051"}, inplace = True)
huq_1112.rename(columns = {"HUQ050": "HUQ051"}, inplace = True)
huq_colnames = ["SEQN", "HUQ051"]
hiq_colnames = ["SEQN", "HIQ011"]
hoq_colnames = ["SEQN", "HOQ065"]
InĀ [27]:
# cleaning dataframes
## demographics
demographics_0506 = demographics_0506[demographics_colnames]
demographics_0708 = demographics_0708[demographics_colnames]
demographics_0910 = demographics_0910[demographics_colnames]
demographics_1112 = demographics_1112[demographics_colnames]
demographics_1314 = demographics_1314[demographics_colnames]
demographics_1516 = demographics_1516[demographics_colnames]

demographics = pd.concat([demographics_0506, demographics_0708, demographics_0910, demographics_1112, demographics_1314, demographics_1516], ignore_index=True)

## dietary
dietary_0506 = dietary_0506[dietary_colnames]
dietary_0708 = dietary_0708[dietary_colnames]
dietary_0910 = dietary_0910[dietary_colnames]
dietary_1112 = dietary_1112[dietary_colnames]
dietary_1314 = dietary_1314[dietary_colnames]
dietary_1516 = dietary_1516[dietary_colnames]

dietary = pd.concat([dietary_0506, dietary_0708, dietary_0910, dietary_1112, dietary_1314, dietary_1516], ignore_index=True)

## examinations
examination_0506 = examination_0506[examination_colnames]
examination_0708 = examination_0708[examination_colnames]
examination_0910 = examination_0910[examination_colnames]
examination_1112 = examination_1112[examination_colnames]
examination_1314 = examination_1314[examination_colnames]
examination_1516 = examination_1516[examination_colnames]

examinations = pd.concat([examination_0506, examination_0708, examination_0910, examination_1112, examination_1314, examination_1516], ignore_index=True)

## metals
CdHgPb_0506 = CdHgPb_0506[CdHgPb_colnames]
CdHgPb_0708 = CdHgPb_0708[CdHgPb_colnames]
CdHgPb_0910 = CdHgPb_0910[CdHgPb_colnames]
CdHgPb_1112 = CdHgPb_1112[CdHgPb_colnames]
CdHgPb_1314 = CdHgPb_1314[CdHgPb_colnames]
CdHgPb_1516 = CdHgPb_1516[CdHgPb_colnames]
arsenic_0506 = arsenic_0506[arsenic_colnames]
arsenic_0708 = arsenic_0708[arsenic_colnames]
arsenic_0910 = arsenic_0910[arsenic_colnames]
arsenic_1112 = arsenic_1112[arsenic_colnames]
arsenic_1314 = arsenic_1314[arsenic_colnames]
arsenic_1516 = arsenic_1516[arsenic_colnames]

CdHgPb = pd.concat([CdHgPb_0506, CdHgPb_0708, CdHgPb_0910, CdHgPb_1112, CdHgPb_1314, CdHgPb_1516], ignore_index=True)
arsenic = pd.concat([arsenic_0506, arsenic_0708, arsenic_0910, arsenic_1112, arsenic_1314, arsenic_1516], ignore_index=True)

## glucose & insulin
glu_0506 = glu_0506[glucose_colnames]
glu_0708 = glu_0708[glucose_colnames]
glu_0910 = glu_0910[glucose_colnames]
glu_1112 = glu_1112[glucose_colnames]
glu_1314 = glu_1314[glucose_colnames]
glu_1516 = glu_1516[glucose_colnames]

glucose = pd.concat([glu_0506, glu_0708, glu_0910, glu_1112, glu_1314, glu_1516], ignore_index=True)

## pfas
pfa_0506 = pfa_0506[pfa_colnames]
pfa_0708 = pfa_0708[pfa_colnames]
pfa_0910 = pfa_0910[pfa_colnames]
pfa_1112 = pfa_1112[pfa_colnames]
pfa_1314 = pfa_1314[pfa_colnames]
pfa_1516 = pfa_1516[pfa_colnames]

pfa = pd.concat([pfa_0506, pfa_0708, pfa_0910, pfa_1112, pfa_1314, pfa_1516], ignore_index=True)

## pahs
pah_0506 = pah_0506[pah_colnames]
pah_0708 = pah_0708[pah_colnames]
pah_0910 = pah_0910[pah_colnames]
pah_1112 = pah_1112[pah_colnames]
pah_1314 = pah_1314[pah_colnames]
pah_1516 = pah_1516[pah_colnames]

pah = pd.concat([pah_0506, pah_0708, pah_0910, pah_1112, pah_1314, pah_1516], ignore_index=True)

## environmental phalates
eph_0506 = eph_0506[eph_colnames]
eph_0708 = eph_0708[eph_colnames]
eph_0910 = eph_0910[eph_colnames]
eph_1112 = eph_1112[eph_colnames]
eph_1314 = eph_1314[eph_colnames]
eph_1516 = eph_1516[eph_colnames]

eph = pd.concat([eph_0506, eph_0708, eph_0910, eph_1112, eph_1314, eph_1516], ignore_index=True)

## phthalates
phthalate_0506 = phthalate_0506[phthalate_colnames]
phthalate_0708 = phthalate_0708[phthalate_colnames]
phthalate_0910 = phthalate_0910[phthalate_colnames]
phthalate_1112 = phthalate_1112[phthalate_colnames]
phthalate_1314 = phthalate_1314[phthalate_colnames]
phthalate_1516 = phthalate_1516[phthalate_colnames]

phthalate = pd.concat([phthalate_0506, phthalate_0708, phthalate_0910, phthalate_1112, phthalate_1314, phthalate_1516], ignore_index=True)

## alcohol
alq_0506 = alq_0506[alq_colnames]
alq_0708 = alq_0708[alq_colnames]
alq_0910 = alq_0910[alq_colnames]
alq_1112 = alq_1112[alq_colnames]
alq_1314 = alq_1314[alq_colnames]
alq_1516 = alq_1516[alq_colnames]

alq = pd.concat([alq_0506, alq_0708, alq_0910, alq_1112, alq_1314, alq_1516], ignore_index=True)

## physical activity
paq_0506 = paq_0506[paq_colnames]
paq_0708 = paq_0708[paq_colnames]
paq_0910 = paq_0910[paq_colnames]
paq_1112 = paq_1112[paq_colnames]
paq_1314 = paq_1314[paq_colnames]
paq_1516 = paq_1516[paq_colnames]

paq = pd.concat([paq_0506, paq_0708, paq_0910, paq_1112, paq_1314, paq_1516], ignore_index=True)

## smoking
smq_0506 = smq_0506[smq_colnames]
smq_0708 = smq_0708[smq_colnames]
smq_0910 = smq_0910[smq_colnames]
smq_1112 = smq_1112[smq_colnames]
smq_1314 = smq_1314[smq_colnames]
smq_1516 = smq_1516[smq_colnames]

smq = pd.concat([smq_0506, smq_0708, smq_0910, smq_1112, smq_1314, smq_1516], ignore_index=True)

## mental health
dpq_0506 = dpq_0506[dpq_colnames]
dpq_0708 = dpq_0708[dpq_colnames]
dpq_0910 = dpq_0910[dpq_colnames]
dpq_1112 = dpq_1112[dpq_colnames]
dpq_1314 = dpq_1314[dpq_colnames]
dpq_1516 = dpq_1516[dpq_colnames]

dpq = pd.concat([dpq_0506, dpq_0708, dpq_0910, dpq_1112, dpq_1314, dpq_1516], ignore_index=True)

## sleep disorders
slq_0506 = slq_0506[slq_colnames]
slq_0708 = slq_0708[slq_colnames]
slq_0910 = slq_0910[slq_colnames]
slq_1112 = slq_1112[slq_colnames]
slq_1314 = slq_1314[slq_colnames]
slq_1516 = slq_1516[slq_colnames]

slq = pd.concat([slq_0506, slq_0708, slq_0910, slq_1112, slq_1314, slq_1516], ignore_index=True)

## medical conditions
mcq_0506 = mcq_0506[mcq_colnames]
mcq_0708 = mcq_0708[mcq_colnames]
mcq_0910 = mcq_0910[mcq_colnames]
mcq_1112 = mcq_1112[mcq_colnames]
mcq_1314 = mcq_1314[mcq_colnames]
mcq_1516 = mcq_1516[mcq_colnames]

mcq = pd.concat([mcq_0506, mcq_0708, mcq_0910, mcq_1112, mcq_1314, mcq_1516], ignore_index=True)

## hospital utilization & access to care
huq_0506 = huq_0506[huq_colnames]
huq_0708 = huq_0708[huq_colnames]
huq_0910 = huq_0910[huq_colnames]
huq_1112 = huq_1112[huq_colnames]
huq_1314 = huq_1314[huq_colnames]
huq_1516 = huq_1516[huq_colnames]

huq = pd.concat([huq_0506, huq_0708, huq_0910, huq_1112, huq_1314, huq_1516], ignore_index=True)

## health insurance
hiq_0506 = hiq_0506[hiq_colnames]
hiq_0708 = hiq_0708[hiq_colnames]
hiq_0910 = hiq_0910[hiq_colnames]
hiq_1112 = hiq_1112[hiq_colnames]
hiq_1314 = hiq_1314[hiq_colnames]
hiq_1516 = hiq_1516[hiq_colnames]

hiq = pd.concat([hiq_0506, hiq_0708, hiq_0910, hiq_1112, hiq_1314, hiq_1516], ignore_index=True)

## housing characteristics
hoq_0506 = hoq_0506[hoq_colnames]
hoq_0708 = hoq_0708[hoq_colnames]
hoq_0910 = hoq_0910[hoq_colnames]
hoq_1112 = hoq_1112[hoq_colnames]
hoq_1314 = hoq_1314[hoq_colnames]
hoq_1516 = hoq_1516[hoq_colnames]

hoq = pd.concat([hoq_0506, hoq_0708, hoq_0910, hoq_1112, hoq_1314, hoq_1516], ignore_index=True)
InĀ [29]:
# merging datasets
dfs = [demographics, dietary, examinations, CdHgPb, arsenic, glucose, pfa, pah, eph, phthalate, alq, paq, smq, dpq, slq, mcq, huq, hiq, hoq]
nhanes_merged = nhanes
for df in dfs:
    nhanes_merged = pd.merge(nhanes_merged, df, on = "SEQN", how = "left")
nhanes_merged.head(100)
Out[29]:
SEQN diabetes LBDGLUSI LBDGLTSI BPQ020 BPQ080 DID060 RHD143 RIAGENDR RIDAGEYR RIDRETH1 DMDEDUC2 INDHHIN2 DR1TKCAL DR1TCARB DR1TSUGR DR1TTFAT DR1TFIBE DR1_320Z BMXBMI LBXBCD LBXBPB LBXTHG URXUAS LBXGLU LBXIN LBXMPAH LBXPFDO LBXPFNA LBXPFHS LBXPFDE LBXPFUA URXP01 URXP02 URXP03 URXP04 URXP06 URXBPH URXTRS URXBP3 URXBUP URXEPB URXMPB URXPPB URXCNP URXCOP URXECP URXMBP URXMC1 URXMEP URXMHH URXMHP URXMIB URXMNP URXMOH URXMZP ALQ130 PAQ710 SMQ020 DPQ010 SLQ050 MCQ300C HUQ051 HIQ011 HOQ065
0 83733 0 5.59 0.00 2.0 2.0 NaN NaN 1.0 53.0 3.0 3.0 4.0 2964.0 356.85 180.84 77.91 7.3 5.070000e+02 30.8 3.53 2.60 3.08 6.47 101.0 17.26 NaN NaN NaN NaN NaN NaN 26200.0 36800.0 1950.0 2540.0 452.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 6.0 5.000000e+00 1.0 1.000000e+00 2.0 1.0 5.397605e-79 2.0 1.0
1 83736 0 4.66 0.00 2.0 2.0 NaN 2.0 2.0 42.0 4.0 4.0 7.0 604.0 90.30 71.84 19.63 2.0 5.397605e-79 20.3 NaN NaN NaN NaN 84.0 5.42 0.07 0.07 0.5 0.60 0.10 0.20 NaN NaN NaN NaN NaN 1.30 28.8 12.10 0.07 20.10 79.0 64.5 2.2 5.6 14.1 56.2 2.00 244.7 8.7 2.00 73.7 0.64 7.8 63.60 1.0 4.000000e+00 2.0 1.000000e+00 1.0 9.0 2.000000e+00 1.0 2.0
2 83737 0 5.93 5.50 2.0 2.0 NaN NaN 2.0 72.0 1.0 2.0 14.0 1304.0 153.43 22.31 43.08 16.1 7.182000e+02 28.6 NaN NaN NaN NaN 107.0 8.24 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 5.397605e-79 2.0 5.397605e-79 2.0 1.0 2.000000e+00 2.0 1.0
3 83741 0 5.27 5.27 2.0 2.0 NaN NaN 1.0 22.0 4.0 4.0 7.0 2338.0 282.58 167.72 91.07 11.0 7.200000e+02 28.0 0.20 0.72 1.38 6.11 95.0 11.39 NaN NaN NaN NaN NaN NaN 1400.0 2020.0 83.0 81.0 41.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 8.0 1.000000e+00 1.0 5.397605e-79 2.0 1.0 2.000000e+00 2.0 2.0
4 83743 0 5.38 0.00 2.0 2.0 NaN NaN 1.0 18.0 5.0 NaN 15.0 NaN NaN NaN NaN NaN NaN 26.2 0.12 0.61 4.30 8.90 97.0 11.40 NaN NaN NaN NaN NaN NaN 777.0 772.0 20.0 34.0 43.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 1.000000e+00 2.0 NaN 2.0 NaN 2.000000e+00 1.0 1.0
5 83749 0 4.88 5.94 2.0 2.0 NaN NaN 2.0 17.0 3.0 NaN 14.0 2461.0 289.33 177.07 100.19 7.3 2.400000e+02 29.0 NaN NaN NaN NaN 88.0 16.00 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 3.000000e+00 NaN NaN 1.0 NaN 2.000000e+00 1.0 1.0
6 83753 0 5.71 0.00 NaN NaN NaN NaN 1.0 15.0 4.0 NaN 8.0 2277.0 272.01 145.02 85.91 10.7 1.974000e+03 24.5 0.11 0.47 0.37 15.59 103.0 5.43 NaN NaN NaN NaN NaN NaN 957.0 16700.0 208.0 525.0 191.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 2.000000e+00 NaN NaN NaN NaN 2.000000e+00 1.0 1.0
7 83761 0 5.27 9.10 2.0 2.0 NaN NaN 2.0 24.0 5.0 5.0 1.0 2055.0 234.44 122.15 94.82 11.5 8.700000e+02 25.3 1.11 1.80 3.22 32.34 95.0 13.23 NaN NaN NaN NaN NaN NaN 368.0 7260.0 25.0 87.0 109.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 1.0 8.000000e+00 2.0 5.397605e-79 2.0 2.0 5.397605e-79 9.0 2.0
8 83770 0 5.50 6.44 NaN NaN NaN NaN 1.0 15.0 4.0 NaN 4.0 885.0 179.37 106.99 17.64 9.5 2.535000e+02 18.3 NaN NaN NaN NaN 99.0 4.33 0.07 0.07 0.4 0.40 0.20 0.10 NaN NaN NaN NaN NaN 0.80 2.2 3.10 0.07 0.71 3.4 1.0 2.4 29.4 5.6 5.5 1.70 12.1 3.0 0.57 12.5 1.10 2.1 5.30 NaN 2.000000e+00 NaN NaN NaN NaN 1.000000e+00 1.0 1.0
9 83778 0 5.54 4.44 2.0 2.0 NaN NaN 1.0 16.0 2.0 NaN 5.0 3335.0 294.47 160.73 169.00 8.8 3.450000e+02 34.5 NaN NaN NaN NaN 100.0 28.46 0.07 0.07 0.5 0.40 0.07 0.07 NaN NaN NaN NaN NaN 0.70 2.5 30.70 0.07 0.71 16.0 1.6 1.1 6.8 5.3 8.3 1.50 93.0 5.9 2.00 14.5 1.00 2.9 6.90 NaN 3.000000e+00 NaN NaN 2.0 NaN 3.000000e+00 1.0 1.0
10 83781 0 5.38 6.77 2.0 2.0 NaN NaN 2.0 27.0 4.0 5.0 77.0 2802.0 341.58 154.88 107.08 22.8 2.028900e+03 34.0 NaN NaN NaN NaN 97.0 13.97 0.07 0.07 1.1 0.70 0.30 0.30 NaN NaN NaN NaN NaN 0.60 297.0 4.90 2.30 2.40 55.2 12.5 1.1 2.1 4.0 17.5 0.60 84.3 2.7 2.10 14.7 0.64 1.9 17.30 3.0 5.000000e+00 2.0 5.397605e-79 2.0 1.0 2.000000e+00 1.0 2.0
11 83790 1 22.10 30.10 2.0 2.0 NaN NaN 1.0 56.0 3.0 1.0 4.0 7455.0 1177.49 885.92 229.37 19.8 2.394000e+03 24.4 NaN NaN NaN NaN 397.0 4.36 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 4.000000e+00 1.0 2.000000e+00 2.0 1.0 5.397605e-79 1.0 1.0
12 83799 0 5.55 0.00 2.0 2.0 NaN NaN 2.0 37.0 2.0 4.0 14.0 NaN NaN NaN NaN NaN NaN 25.5 0.40 1.01 1.28 NaN 100.0 4.67 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 8.000000e+00 2.0 NaN 2.0 2.0 1.000000e+00 1.0 2.0
13 83809 0 5.22 6.77 2.0 2.0 NaN NaN 2.0 20.0 4.0 3.0 14.0 1445.0 62.87 13.13 95.92 4.4 8.700000e+02 26.2 0.10 0.21 0.28 NaN 94.0 7.19 0.07 0.07 0.5 0.70 0.10 0.07 NaN NaN NaN NaN NaN 0.50 16.8 0.28 3.70 119.10 329.8 34.0 0.3 6.9 1.8 1.0 1.20 105.7 1.1 0.57 2.3 0.64 0.7 1.00 NaN 3.000000e+00 2.0 5.397605e-79 2.0 1.0 2.000000e+00 2.0 2.0
14 83813 0 5.83 3.44 2.0 2.0 NaN NaN 1.0 24.0 3.0 4.0 6.0 2585.0 301.16 168.98 104.30 9.7 3.300000e+03 26.9 1.86 1.13 2.62 1.10 105.0 12.15 NaN NaN NaN NaN NaN NaN 5320.0 4010.0 223.0 219.0 43.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 2.0 5.397605e-79 1.0 5.397605e-79 2.0 2.0 2.000000e+00 1.0 2.0
15 83815 0 4.94 6.61 NaN NaN NaN NaN 2.0 15.0 4.0 NaN 6.0 1506.0 210.45 102.58 51.38 9.9 8.700000e+02 32.0 0.07 0.31 0.36 2.52 89.0 31.22 NaN NaN NaN NaN NaN NaN 193.0 1090.0 26.0 57.0 36.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 5.000000e+00 NaN NaN NaN NaN 5.397605e-79 1.0 2.0
16 83816 0 5.11 4.00 2.0 2.0 NaN NaN 1.0 27.0 3.0 4.0 7.0 2425.0 300.18 133.75 82.43 18.2 7.500000e+02 18.6 0.16 0.57 0.94 11.17 92.0 5.03 NaN NaN NaN NaN NaN NaN 7030.0 17400.0 824.0 816.0 245.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 2.0 2.000000e+00 2.0 1.000000e+00 1.0 2.0 5.397605e-79 1.0 2.0
17 83822 0 4.44 4.66 2.0 2.0 NaN NaN 2.0 20.0 4.0 4.0 6.0 833.0 112.45 59.85 21.30 2.9 4.635000e+02 22.2 0.35 0.40 0.58 3.44 80.0 11.76 NaN NaN NaN NaN NaN NaN 846.0 9800.0 52.0 137.0 73.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 3.000000e+00 2.0 1.000000e+00 2.0 2.0 1.000000e+00 2.0 2.0
18 83823 0 5.66 0.00 2.0 2.0 NaN NaN 2.0 29.0 1.0 1.0 3.0 NaN NaN NaN NaN NaN NaN 29.7 0.27 7.30 0.67 NaN 102.0 19.77 0.20 0.07 0.5 0.40 0.20 0.07 NaN NaN NaN NaN NaN 0.40 1.2 2.60 0.07 0.71 133.2 51.3 0.7 9.3 2.9 9.3 0.28 10.9 1.3 0.57 7.6 0.64 1.0 30.80 NaN 1.000000e+00 2.0 NaN 2.0 2.0 1.000000e+00 2.0 2.0
19 83825 0 5.61 5.55 2.0 2.0 NaN NaN 2.0 16.0 4.0 NaN 6.0 1009.0 159.10 77.54 32.04 4.2 3.600000e+02 21.6 NaN NaN NaN NaN 101.0 9.15 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 5.000000e+00 NaN NaN 2.0 NaN 5.397605e-79 1.0 1.0
20 83828 0 5.61 8.22 2.0 2.0 NaN 2.0 2.0 39.0 1.0 3.0 4.0 2068.0 256.01 98.96 87.27 14.6 1.800000e+03 27.2 NaN NaN NaN NaN 101.0 8.84 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 2.0 5.397605e-79 2.0 1.000000e+00 2.0 1.0 5.397605e-79 2.0 1.0
21 83830 0 5.33 6.22 NaN NaN NaN NaN 1.0 15.0 4.0 NaN 8.0 484.0 71.78 19.59 18.16 2.9 1.245000e+03 25.1 NaN NaN NaN NaN 96.0 4.30 0.10 0.07 0.5 0.60 0.10 0.07 NaN NaN NaN NaN NaN 1.60 4.2 16.60 0.07 0.71 18.4 1.9 1.5 3.0 4.5 9.7 0.28 17.6 3.3 0.57 7.5 0.64 1.9 6.50 NaN 1.000000e+00 NaN NaN NaN NaN 1.000000e+00 1.0 1.0
22 83831 0 5.05 0.00 NaN NaN NaN NaN 2.0 15.0 5.0 NaN 6.0 1169.0 117.55 60.58 40.17 3.8 1.500000e+02 30.5 0.07 0.40 0.20 2.65 91.0 38.70 NaN NaN NaN NaN NaN NaN 819.0 1820.0 126.0 248.0 144.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 5.397605e-79 NaN NaN NaN NaN 2.000000e+00 1.0 2.0
23 83832 0 5.77 7.44 2.0 2.0 NaN NaN 2.0 50.0 1.0 1.0 7.0 2852.0 347.64 133.32 110.36 27.3 9.150000e+02 42.6 0.25 1.34 2.93 32.39 104.0 20.46 NaN NaN NaN NaN NaN NaN 1990.0 18400.0 60.0 195.0 102.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 4.0 8.000000e+00 2.0 5.397605e-79 2.0 1.0 1.000000e+00 2.0 2.0
24 83833 0 5.16 4.55 NaN NaN NaN NaN 1.0 14.0 3.0 NaN 10.0 2579.0 354.77 149.74 90.66 18.3 5.397605e-79 17.7 0.07 0.69 0.20 NaN 93.0 9.65 0.07 0.07 0.7 1.30 0.20 0.07 NaN NaN NaN NaN NaN 2.60 6.3 84.30 0.07 0.71 5.7 0.4 3.3 4.4 11.1 12.8 0.40 34.5 5.7 0.90 26.5 0.64 4.5 2.60 NaN 3.000000e+00 NaN NaN NaN NaN 1.000000e+00 1.0 1.0
25 83835 1 5.22 11.50 NaN NaN NaN NaN 2.0 13.0 4.0 NaN 2.0 1099.0 144.23 45.45 43.82 6.6 5.070000e+02 29.8 NaN NaN NaN NaN 94.0 82.68 0.20 0.07 1.6 0.50 2.20 1.70 NaN NaN NaN NaN NaN 1.00 24.5 2.80 0.07 1.90 154.9 11.0 1.5 7.3 5.4 18.9 0.90 23.4 4.9 0.57 17.5 0.64 3.3 17.00 NaN 5.397605e-79 NaN NaN NaN NaN 2.000000e+00 1.0 2.0
26 83836 0 5.83 6.27 2.0 2.0 NaN NaN 2.0 18.0 1.0 NaN 14.0 1365.0 182.55 123.90 54.34 2.2 1.500000e+02 28.0 0.31 0.65 0.89 6.34 105.0 18.53 NaN NaN NaN NaN NaN NaN 1180.0 13300.0 193.0 299.0 60.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 4.0 3.000000e+00 2.0 1.000000e+00 2.0 NaN 3.000000e+00 1.0 1.0
27 83841 0 6.00 0.00 NaN NaN NaN NaN 1.0 13.0 5.0 NaN 15.0 3612.0 501.32 91.85 113.38 34.0 9.750000e+02 20.6 0.12 0.52 2.41 NaN 108.0 14.37 0.07 0.07 0.6 1.30 0.20 0.20 NaN NaN NaN NaN NaN 5.40 1.2 15.60 0.20 0.71 6.7 1.1 4.7 17.4 15.6 19.7 1.50 58.1 6.8 1.10 14.3 0.64 4.5 5.50 NaN NaN NaN NaN NaN NaN 1.000000e+00 1.0 2.0
28 83844 0 5.77 6.00 2.0 2.0 NaN NaN 1.0 27.0 1.0 2.0 7.0 1308.0 189.51 39.27 39.40 23.6 1.014000e+03 23.1 0.32 0.92 1.19 26.74 104.0 9.93 NaN NaN NaN NaN NaN NaN 340.0 5360.0 77.0 152.0 67.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 12.0 2.000000e+00 2.0 1.000000e+00 1.0 2.0 4.000000e+00 2.0 2.0
29 83847 0 5.50 4.05 2.0 2.0 NaN NaN 1.0 18.0 3.0 NaN 15.0 3479.0 410.94 176.61 146.89 18.6 3.750000e+02 20.8 NaN NaN NaN NaN 99.0 7.15 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 1.000000e+00 2.0 5.397605e-79 2.0 NaN 4.000000e+00 1.0 1.0
30 83851 0 4.39 5.22 2.0 2.0 NaN NaN 2.0 37.0 3.0 3.0 8.0 2079.0 253.72 119.36 86.20 16.8 5.397605e-79 35.3 0.67 0.43 0.71 2.81 79.0 NaN NaN NaN NaN NaN NaN NaN 9930.0 9770.0 890.0 1090.0 166.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 1.0 1.000000e+00 1.0 5.397605e-79 2.0 2.0 2.000000e+00 1.0 2.0
31 83854 0 5.59 6.61 2.0 2.0 NaN NaN 2.0 46.0 1.0 5.0 9.0 3127.0 299.12 141.47 175.33 19.6 6.652500e+02 41.6 0.07 0.50 0.65 NaN 101.0 33.07 0.30 0.07 0.7 0.50 0.30 0.07 NaN NaN NaN NaN NaN 1.20 18.0 166.50 0.07 0.71 648.3 163.6 3.2 59.8 28.4 14.4 3.10 91.8 16.8 1.60 13.8 1.20 10.6 2.20 1.0 2.000000e+00 2.0 5.397605e-79 1.0 2.0 3.000000e+00 1.0 1.0
32 83855 0 5.33 4.94 NaN NaN NaN NaN 2.0 12.0 3.0 NaN 7.0 1667.0 195.55 84.74 76.57 14.1 7.800000e+02 16.2 0.07 0.27 0.82 NaN 96.0 7.56 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 5.397605e-79 NaN NaN NaN NaN 2.000000e+00 1.0 1.0
33 83860 0 6.11 6.94 2.0 2.0 NaN NaN 1.0 41.0 4.0 4.0 15.0 1707.0 172.89 60.01 77.86 8.9 2.286000e+03 40.7 0.72 0.85 1.38 NaN 110.0 68.80 0.07 0.07 0.6 2.50 0.10 0.07 NaN NaN NaN NaN NaN 0.60 3.9 4.60 0.07 3.30 155.7 24.6 1.0 4.4 12.7 5.4 1.70 425.8 11.8 3.90 3.9 0.64 6.5 1.30 3.0 2.000000e+00 1.0 5.397605e-79 2.0 2.0 2.000000e+00 1.0 1.0
34 83862 0 5.33 0.00 2.0 2.0 NaN NaN 2.0 19.0 2.0 NaN 99.0 NaN NaN NaN NaN NaN NaN 24.4 NaN NaN NaN NaN 96.0 14.47 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 1.0 2.000000e+00 2.0 5.397605e-79 2.0 NaN 5.397605e-79 1.0 1.0
35 83863 0 5.38 3.61 2.0 2.0 NaN NaN 1.0 35.0 1.0 3.0 14.0 2458.0 262.22 84.27 113.90 17.6 1.014000e+03 31.1 0.24 0.86 0.94 NaN 97.0 22.23 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 3.0 1.000000e+00 2.0 3.000000e+00 2.0 1.0 5.397605e-79 2.0 1.0
36 83866 0 5.00 0.00 2.0 2.0 NaN NaN 1.0 40.0 4.0 4.0 14.0 1977.0 162.04 47.63 86.01 8.7 1.005000e+03 30.7 0.50 1.61 2.03 NaN 90.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 8.40 1.2 47.10 0.07 0.71 13.8 5.0 10.6 13.4 46.0 18.3 2.60 303.1 45.4 5.60 26.4 0.64 29.2 8.00 2.0 2.000000e+00 1.0 5.397605e-79 2.0 2.0 1.000000e+00 2.0 1.0
37 83874 0 5.50 0.00 2.0 2.0 NaN NaN 1.0 54.0 1.0 2.0 5.0 NaN NaN NaN NaN NaN NaN 30.2 NaN NaN NaN NaN 99.0 17.13 0.07 0.07 1.0 1.20 0.20 0.10 NaN NaN NaN NaN NaN 0.60 4.5 27.10 0.20 4.20 77.8 10.2 8.1 6.6 3.8 1.5 0.50 15.0 1.1 0.57 3.3 0.64 0.7 2.40 NaN 2.000000e+00 1.0 NaN 2.0 2.0 3.000000e+00 1.0 2.0
38 83887 0 5.27 6.77 2.0 2.0 NaN NaN 2.0 51.0 4.0 2.0 99.0 1277.0 243.30 72.63 17.44 11.7 1.014000e+03 23.1 NaN NaN NaN NaN 95.0 8.67 0.10 0.07 0.5 1.70 0.20 0.40 NaN NaN NaN NaN NaN 0.20 227.9 2.70 0.07 0.71 3.8 0.9 1.0 3.6 2.0 2.2 0.28 12.5 0.6 0.57 2.6 0.64 0.4 1.90 NaN 2.000000e+00 2.0 1.000000e+00 2.0 2.0 3.000000e+00 1.0 2.0
39 83894 0 5.50 8.99 2.0 2.0 NaN NaN 1.0 60.0 4.0 3.0 8.0 1247.0 127.50 61.63 19.28 5.4 3.900000e+02 19.7 1.11 4.33 1.37 NaN 99.0 6.69 0.70 0.07 0.9 3.20 0.20 0.20 NaN NaN NaN NaN NaN 2.50 1.2 6.10 0.07 124.40 283.5 34.4 8.0 17.5 16.3 81.6 2.90 124.6 13.1 2.70 63.6 2.20 3.1 36.40 5.0 2.000000e+00 1.0 5.397605e-79 2.0 1.0 1.000000e+00 1.0 1.0
40 83897 0 5.11 5.22 2.0 2.0 NaN NaN 2.0 29.0 3.0 5.0 7.0 2025.0 164.81 30.50 95.96 15.6 3.045000e+03 30.7 0.50 0.59 0.20 1.30 92.0 13.46 NaN NaN NaN NaN NaN NaN 268.0 311.0 40.0 74.0 71.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 2.0 2.000000e+00 1.0 5.397605e-79 2.0 1.0 2.000000e+00 1.0 1.0
41 83908 0 5.66 5.88 2.0 2.0 NaN NaN 1.0 51.0 4.0 3.0 6.0 1466.0 231.49 85.91 37.70 11.5 9.450000e+02 24.7 0.17 0.73 5.75 NaN 102.0 7.73 0.07 0.07 1.3 1.70 0.90 1.30 NaN NaN NaN NaN NaN 2.10 3.8 10.10 0.07 0.71 16.2 7.3 2.1 4.9 11.2 45.8 1.10 127.3 7.5 2.00 63.9 0.64 4.4 4.70 1.0 5.397605e-79 2.0 5.397605e-79 2.0 1.0 2.000000e+00 1.0 2.0
42 83909 0 5.77 5.55 2.0 2.0 NaN NaN 2.0 49.0 3.0 4.0 77.0 3197.0 186.96 19.20 155.03 10.4 5.397605e-79 37.8 0.59 0.69 0.61 NaN 104.0 16.63 0.07 0.07 0.4 0.70 0.20 0.10 NaN NaN NaN NaN NaN 3.20 1.2 1051.40 1.30 25.70 310.4 84.2 0.9 2.4 3.7 12.3 0.28 20.0 2.5 0.57 3.3 0.64 1.9 5.60 NaN 5.397605e-79 1.0 NaN 2.0 2.0 1.000000e+00 1.0 7.0
43 83911 1 18.40 0.00 2.0 2.0 NaN 2.0 2.0 43.0 4.0 4.0 14.0 1664.0 204.07 74.96 72.87 14.8 3.042000e+03 30.7 NaN NaN NaN NaN 331.0 7.03 0.07 0.07 0.2 0.07 0.07 0.07 NaN NaN NaN NaN NaN 1.10 1.2 0.28 0.07 5.90 28.9 9.7 0.6 14.9 8.7 5.4 3.10 66.7 4.2 1.10 5.1 1.10 3.2 1.60 1.0 3.000000e+00 2.0 5.397605e-79 2.0 1.0 5.397605e-79 2.0 2.0
44 83919 0 4.94 4.27 2.0 2.0 NaN NaN 1.0 19.0 1.0 NaN 6.0 1053.0 98.48 0.67 59.09 8.2 7.605000e+02 21.6 0.18 0.41 0.20 1.31 89.0 2.47 NaN NaN NaN NaN NaN NaN 1650.0 2190.0 110.0 270.0 56.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 5.0 5.397605e-79 1.0 2.000000e+00 1.0 NaN 2.000000e+00 1.0 1.0
45 83931 0 5.44 0.00 2.0 2.0 NaN NaN 1.0 37.0 2.0 2.0 7.0 1895.0 259.34 142.98 52.45 4.5 5.397605e-79 26.0 0.49 1.75 1.22 NaN 98.0 7.72 0.07 0.07 2.3 1.50 0.20 0.30 NaN NaN NaN NaN NaN 1.60 2.3 36.40 0.07 1.70 755.9 96.5 1.1 8.2 9.9 17.7 1.20 54.1 7.4 1.10 20.2 0.64 4.6 9.50 2.0 3.000000e+00 1.0 1.000000e+00 2.0 1.0 5.397605e-79 2.0 1.0
46 83934 0 4.94 5.16 2.0 2.0 NaN 2.0 2.0 27.0 4.0 3.0 3.0 NaN NaN NaN NaN NaN NaN 17.2 0.14 0.22 0.52 NaN 89.0 5.83 0.07 0.07 1.5 1.10 0.07 0.07 NaN NaN NaN NaN NaN 0.30 1.2 4.00 0.07 0.71 33.0 7.6 0.5 1.8 2.8 44.1 0.40 17.2 2.0 0.57 13.9 0.64 1.5 36.10 NaN 1.000000e+00 2.0 5.397605e-79 2.0 2.0 2.000000e+00 2.0 2.0
47 83936 0 5.22 7.11 NaN NaN NaN NaN 1.0 12.0 3.0 NaN 15.0 2407.0 392.35 150.44 73.82 36.0 2.022000e+03 23.5 NaN NaN NaN NaN 94.0 8.35 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 4.000000e+00 NaN NaN NaN NaN 3.000000e+00 1.0 1.0
48 83943 0 5.33 4.61 2.0 2.0 NaN NaN 2.0 17.0 1.0 NaN 4.0 2114.0 212.53 46.03 102.80 26.0 9.750000e+02 26.8 NaN NaN NaN NaN 96.0 12.85 0.07 0.07 0.3 0.40 0.07 0.07 NaN NaN NaN NaN NaN 0.40 80.4 50.70 0.07 0.71 4.5 1.5 0.5 7.8 1.2 1.8 0.28 6.3 0.6 0.57 2.9 0.64 0.4 0.21 NaN 5.397605e-79 NaN NaN 2.0 NaN 1.000000e+00 1.0 1.0
49 83985 0 4.77 0.00 2.0 2.0 NaN NaN 1.0 55.0 3.0 5.0 15.0 NaN NaN NaN NaN NaN NaN 18.5 NaN NaN NaN NaN 86.0 2.51 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 1.000000e+00 2.0 NaN 2.0 1.0 5.397605e-79 1.0 1.0
50 83988 0 6.11 5.22 2.0 2.0 NaN NaN 1.0 21.0 1.0 3.0 8.0 3083.0 323.89 82.30 126.09 34.2 5.760000e+03 33.0 NaN NaN NaN NaN 110.0 13.94 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 5.0 5.000000e+00 1.0 5.397605e-79 2.0 2.0 2.000000e+00 1.0 2.0
51 83991 0 5.44 7.16 NaN NaN NaN NaN 2.0 14.0 5.0 NaN 6.0 1419.0 202.27 87.36 41.14 7.6 6.750000e+02 30.4 0.15 0.46 0.20 5.36 98.0 17.44 NaN NaN NaN NaN NaN NaN 387.0 2400.0 33.0 69.0 52.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 1.000000e+00 NaN NaN NaN NaN 1.000000e+00 1.0 1.0
52 83995 1 6.22 11.20 2.0 2.0 NaN NaN 1.0 43.0 4.0 4.0 15.0 2362.0 103.39 30.58 103.12 3.2 5.397605e-79 43.2 NaN NaN NaN NaN 112.0 41.03 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 3.0 3.000000e+00 2.0 5.397605e-79 1.0 1.0 2.000000e+00 1.0 1.0
53 84018 0 5.83 0.00 2.0 2.0 NaN NaN 1.0 27.0 3.0 5.0 14.0 2806.0 338.34 115.88 114.59 14.3 1.800000e+03 29.6 0.51 0.51 2.02 1.97 105.0 11.55 NaN NaN NaN NaN NaN NaN 2640.0 7270.0 231.0 479.0 104.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 8.0 5.397605e-79 1.0 5.397605e-79 1.0 1.0 3.000000e+00 1.0 2.0
54 84029 0 5.16 5.77 2.0 2.0 NaN 2.0 2.0 28.0 1.0 3.0 NaN 1812.0 243.64 100.74 55.31 11.0 1.374000e+03 20.3 0.17 0.14 0.20 3.62 93.0 7.20 NaN NaN NaN NaN NaN NaN 267.0 5260.0 22.0 116.0 91.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 1.0 5.397605e-79 2.0 5.397605e-79 2.0 1.0 5.397605e-79 2.0 NaN
55 84030 0 6.38 8.66 2.0 2.0 NaN NaN 1.0 46.0 1.0 1.0 77.0 742.0 96.09 42.70 21.52 5.1 3.600000e+02 25.1 0.20 1.16 1.44 NaN 115.0 10.78 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 10.0 3.000000e+00 2.0 5.397605e-79 2.0 2.0 2.000000e+00 2.0 2.0
56 84032 0 5.03 4.00 2.0 2.0 NaN NaN 1.0 51.0 1.0 4.0 77.0 1515.0 126.93 53.95 46.50 11.4 3.840000e+03 26.8 NaN NaN NaN NaN 91.0 8.40 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 3.0 2.000000e+00 1.0 5.397605e-79 2.0 2.0 5.397605e-79 2.0 2.0
57 84033 0 5.22 5.16 2.0 2.0 NaN 2.0 2.0 41.0 5.0 4.0 9.0 1995.0 243.05 124.18 85.91 11.2 1.080000e+03 33.6 0.21 0.48 0.20 3.24 94.0 8.20 NaN NaN NaN NaN NaN NaN 469.0 1390.0 42.0 103.0 69.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 1.0 5.397605e-79 1.0 5.397605e-79 1.0 2.0 2.000000e+00 2.0 1.0
58 84039 0 5.88 5.50 2.0 2.0 NaN NaN 1.0 36.0 3.0 2.0 7.0 4184.0 470.50 215.12 201.79 32.1 5.397605e-79 28.2 NaN NaN NaN NaN 106.0 11.95 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 4.0 5.000000e+00 2.0 5.397605e-79 2.0 2.0 1.000000e+00 1.0 1.0
59 84042 0 5.61 7.27 2.0 2.0 NaN NaN 1.0 75.0 3.0 2.0 6.0 2744.0 341.20 168.44 63.50 13.3 5.397605e-79 19.3 0.92 2.55 0.20 4.72 101.0 3.04 NaN NaN NaN NaN NaN NaN 2330.0 3070.0 135.0 222.0 100.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 6.0 5.000000e+00 1.0 5.397605e-79 2.0 1.0 5.397605e-79 1.0 1.0
60 84047 0 6.22 6.77 2.0 2.0 NaN NaN 2.0 72.0 5.0 5.0 15.0 1655.0 251.02 120.96 45.54 16.6 5.850000e+02 22.1 0.47 0.92 2.74 NaN 112.0 17.55 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 2.000000e+00 2.0 5.397605e-79 2.0 2.0 1.000000e+00 1.0 1.0
61 84051 0 5.31 3.44 2.0 2.0 NaN NaN 2.0 55.0 1.0 5.0 9.0 1779.0 232.80 79.64 65.42 16.5 9.600000e+02 32.5 2.08 1.88 0.38 NaN 96.0 6.23 0.20 0.07 1.0 1.20 0.07 0.07 NaN NaN NaN NaN NaN 0.14 4.8 88.40 0.40 0.71 24.7 1.2 0.7 3.4 3.3 3.0 0.28 39.7 1.5 0.57 11.0 0.64 1.0 2.90 1.0 4.000000e+00 1.0 5.397605e-79 2.0 9.0 5.397605e-79 1.0 1.0
62 84056 0 5.44 7.22 NaN NaN NaN NaN 2.0 14.0 5.0 NaN 10.0 1399.0 201.74 21.73 51.50 6.6 1.521000e+03 28.6 0.63 0.62 0.96 15.25 98.0 16.53 NaN NaN NaN NaN NaN NaN 193.0 1640.0 18.0 47.0 27.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 5.000000e+00 NaN NaN NaN NaN 5.397605e-79 2.0 2.0
63 84058 0 4.94 0.00 2.0 2.0 NaN NaN 2.0 21.0 3.0 4.0 9.0 NaN NaN NaN NaN NaN NaN 23.1 0.18 0.40 0.20 NaN 89.0 3.56 0.07 0.07 0.4 0.60 0.10 0.07 NaN NaN NaN NaN NaN 4.80 9.1 64.20 18.50 17.50 499.3 623.9 35.2 36.4 14.5 13.1 7.20 74.2 11.0 1.00 18.0 1.90 7.4 5.20 3.0 2.000000e+00 2.0 5.397605e-79 1.0 2.0 6.000000e+00 1.0 2.0
64 84061 0 5.25 0.00 2.0 2.0 NaN NaN 2.0 50.0 3.0 4.0 9.0 2282.0 239.24 117.41 107.61 20.1 5.397605e-79 27.2 0.28 0.85 1.95 NaN 95.0 7.74 0.07 0.07 0.6 0.60 0.20 0.07 NaN NaN NaN NaN NaN 0.80 33.8 4.00 0.07 0.71 11.3 1.8 0.3 1.8 2.2 1.9 0.28 7.4 1.5 0.90 1.4 0.64 0.9 4.20 NaN 5.000000e+00 1.0 5.397605e-79 1.0 2.0 6.000000e+00 1.0 1.0
65 84062 1 9.16 16.60 2.0 2.0 NaN NaN 2.0 45.0 1.0 4.0 3.0 3449.0 386.92 121.48 177.37 45.7 6.300000e+02 33.9 0.51 0.41 0.72 NaN 165.0 12.36 0.07 0.07 0.4 0.40 0.20 0.07 NaN NaN NaN NaN NaN 0.50 5.2 30.80 0.20 0.71 83.7 13.2 0.9 3.9 10.6 6.2 0.28 6.1 6.0 1.20 6.0 0.64 3.7 2.00 NaN 1.000000e+00 2.0 5.397605e-79 2.0 2.0 5.397605e-79 1.0 2.0
66 84066 0 5.44 5.88 NaN NaN NaN NaN 2.0 14.0 3.0 NaN 15.0 2806.0 349.75 92.54 101.37 16.8 1.365000e+03 18.7 NaN NaN NaN NaN 98.0 8.00 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 5.397605e-79 1.0 1.0
67 84070 0 4.77 6.83 2.0 2.0 NaN NaN 1.0 40.0 3.0 4.0 12.0 2159.0 181.17 7.02 18.25 4.3 3.600000e+02 29.4 3.39 1.78 2.89 NaN 86.0 2.65 0.07 0.07 3.0 3.50 0.10 0.07 NaN NaN NaN NaN NaN 0.40 1.2 3.40 0.07 13.50 17.5 1.8 1.1 4.1 3.1 6.4 0.80 6.3 2.3 0.57 3.4 0.64 0.9 6.00 3.0 2.000000e+00 1.0 5.397605e-79 2.0 2.0 5.397605e-79 2.0 2.0
68 84073 0 5.71 7.11 2.0 2.0 NaN 2.0 2.0 39.0 2.0 3.0 10.0 2851.0 365.52 180.11 121.51 28.1 5.397605e-79 31.9 0.15 0.25 0.29 2.15 103.0 22.59 NaN NaN NaN NaN NaN NaN 1040.0 16500.0 76.0 334.0 89.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 2.0 5.397605e-79 1.0 5.397605e-79 2.0 1.0 5.397605e-79 1.0 1.0
69 84087 0 5.33 5.55 2.0 2.0 NaN NaN 1.0 36.0 5.0 5.0 15.0 2375.0 201.49 95.13 151.41 23.8 5.397605e-79 22.6 0.34 0.98 4.21 11.85 96.0 2.00 NaN NaN NaN NaN NaN NaN 340.0 1140.0 87.0 134.0 94.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 3.0 1.000000e+00 1.0 5.397605e-79 2.0 2.0 2.000000e+00 1.0 2.0
70 84100 0 5.88 7.38 NaN NaN NaN NaN 1.0 13.0 2.0 NaN 3.0 2304.0 259.13 89.52 102.93 19.4 8.700000e+02 28.8 0.07 0.48 2.03 NaN 106.0 55.43 0.07 0.07 0.4 0.70 0.10 0.07 NaN NaN NaN NaN NaN 0.70 1.2 67.20 0.07 0.71 17.1 1.0 2.1 6.7 14.3 11.5 0.70 296.0 5.4 1.10 15.9 0.64 4.8 9.90 NaN 2.000000e+00 NaN NaN NaN NaN 1.000000e+00 2.0 1.0
71 84102 0 5.05 6.38 NaN NaN NaN NaN 2.0 13.0 4.0 NaN 6.0 466.0 58.33 6.27 16.96 4.0 4.350000e+02 33.6 NaN NaN NaN NaN 91.0 29.07 0.07 0.07 0.2 1.50 0.07 0.07 NaN NaN NaN NaN NaN 0.90 1.2 5.40 0.07 0.71 2.8 0.5 1.5 1.8 2.3 10.0 0.60 5.5 2.4 0.57 13.2 0.64 1.6 2.40 NaN 2.000000e+00 NaN NaN NaN NaN 1.000000e+00 1.0 2.0
72 84106 0 5.50 0.00 2.0 2.0 NaN NaN 1.0 57.0 5.0 5.0 15.0 NaN NaN NaN NaN NaN NaN 24.6 0.66 0.90 8.13 58.99 99.0 3.55 NaN NaN NaN NaN NaN NaN 1400.0 4280.0 96.0 157.0 70.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 2.000000e+00 1.0 NaN 2.0 2.0 1.000000e+00 1.0 1.0
73 84121 0 4.74 5.77 2.0 2.0 NaN NaN 1.0 18.0 1.0 NaN 10.0 1078.0 154.32 40.60 35.62 13.5 7.200000e+02 26.6 0.07 0.24 0.20 3.72 85.0 19.19 NaN NaN NaN NaN NaN NaN 471.0 13000.0 154.0 349.0 146.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 3.000000e+00 2.0 5.397605e-79 2.0 NaN 3.000000e+00 1.0 1.0
74 84127 0 4.91 5.44 NaN NaN NaN NaN 2.0 12.0 1.0 NaN 10.0 1148.0 183.80 84.24 36.92 9.2 5.397605e-79 19.9 0.07 0.27 0.20 4.83 88.0 16.26 NaN NaN NaN NaN NaN NaN 464.0 14000.0 120.0 639.0 113.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 5.000000e+00 NaN NaN NaN NaN 3.000000e+00 1.0 1.0
75 84130 0 5.00 2.00 2.0 2.0 NaN 2.0 2.0 26.0 3.0 3.0 10.0 1470.0 140.87 62.64 69.19 10.5 3.885000e+03 20.5 NaN NaN NaN NaN 90.0 8.18 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 2.0 5.000000e+00 1.0 5.397605e-79 2.0 1.0 1.000000e+00 1.0 1.0
76 84134 0 5.33 4.39 NaN NaN NaN NaN 1.0 13.0 3.0 NaN 15.0 1395.0 192.49 41.78 48.42 11.6 1.455000e+03 18.3 NaN NaN NaN NaN 96.0 10.63 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 5.397605e-79 NaN NaN NaN NaN 2.000000e+00 1.0 1.0
77 84142 0 4.50 6.94 2.0 2.0 NaN NaN 1.0 24.0 3.0 3.0 15.0 6223.0 706.87 430.14 279.25 39.0 1.545000e+03 32.8 0.18 0.58 0.37 5.01 81.0 38.27 NaN NaN NaN NaN NaN NaN 662.0 17100.0 61.0 148.0 134.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 3.0 5.000000e+00 2.0 5.397605e-79 2.0 2.0 1.000000e+00 1.0 2.0
78 84143 0 6.11 6.88 2.0 2.0 NaN 2.0 2.0 35.0 5.0 1.0 9.0 1786.0 166.78 45.53 81.53 11.7 8.400000e+02 28.9 2.37 0.71 0.33 0.64 110.0 11.50 NaN NaN NaN NaN NaN NaN 4830.0 4600.0 163.0 305.0 39.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 5.397605e-79 1.0 5.397605e-79 2.0 1.0 2.000000e+00 1.0 2.0
79 84151 0 4.77 0.00 2.0 2.0 NaN NaN 2.0 53.0 3.0 5.0 15.0 1721.0 75.88 18.45 106.42 16.2 1.521000e+03 25.3 NaN NaN NaN NaN 86.0 2.95 0.07 0.07 2.7 1.30 0.30 0.20 NaN NaN NaN NaN NaN 0.60 1.2 1430.80 0.07 11.10 31.7 8.7 0.2 0.9 3.7 4.2 0.28 12.5 2.0 1.30 2.7 0.64 1.3 0.60 2.0 5.397605e-79 2.0 5.397605e-79 2.0 2.0 3.000000e+00 1.0 1.0
80 84168 1 8.10 0.00 2.0 2.0 NaN NaN 1.0 61.0 5.0 1.0 5.0 NaN NaN NaN NaN NaN NaN 16.4 NaN NaN NaN NaN 146.0 6.12 0.07 0.07 1.2 3.00 0.80 0.40 NaN NaN NaN NaN NaN 3.10 1.2 1.70 0.07 0.71 132.8 2.8 0.6 3.2 6.3 13.9 0.40 17.8 3.6 0.57 12.1 0.64 2.0 6.80 NaN 4.000000e+00 1.0 5.397605e-79 2.0 2.0 5.397605e-79 2.0 2.0
81 84181 0 5.77 0.00 2.0 2.0 NaN NaN 1.0 35.0 4.0 3.0 1.0 NaN NaN NaN NaN NaN NaN 30.4 NaN NaN NaN NaN 104.0 8.69 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 2.0 8.000000e+00 1.0 3.000000e+00 2.0 1.0 2.000000e+00 2.0 2.0
82 84191 0 5.27 5.61 NaN NaN NaN NaN 2.0 12.0 4.0 NaN 7.0 425.0 51.16 4.28 16.57 2.7 6.150000e+02 26.5 0.07 0.40 0.20 1.65 95.0 13.09 NaN NaN NaN NaN NaN NaN 418.0 9440.0 77.0 186.0 81.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 1.000000e+00 1.0 2.0
83 84195 0 6.22 9.44 2.0 2.0 NaN NaN 1.0 45.0 4.0 4.0 5.0 1560.0 208.14 95.93 68.05 9.3 1.051990e+03 31.6 NaN NaN NaN NaN 112.0 10.46 0.20 0.07 0.3 2.20 0.07 0.07 NaN NaN NaN NaN NaN 1.60 4.2 9.40 0.07 0.71 6.9 2.5 2.2 64.2 17.2 13.7 5.60 49.9 8.6 2.60 27.8 1.60 5.8 13.90 NaN 1.000000e+00 2.0 1.000000e+00 2.0 1.0 2.000000e+00 1.0 2.0
84 84196 0 5.72 3.39 2.0 2.0 NaN NaN 1.0 32.0 5.0 2.0 14.0 2111.0 357.14 95.73 39.77 16.5 7.800000e+02 23.5 NaN NaN NaN NaN 103.0 6.07 0.07 0.07 0.7 0.80 0.20 0.20 NaN NaN NaN NaN NaN 1.20 2.0 5.30 0.07 0.71 5.5 0.3 0.7 13.7 24.6 18.0 0.70 24.0 16.9 7.00 5.1 2.20 9.8 5.20 3.0 3.000000e+00 1.0 5.397605e-79 2.0 1.0 5.397605e-79 1.0 1.0
85 84197 0 6.83 8.27 2.0 2.0 NaN NaN 1.0 59.0 3.0 5.0 15.0 1485.0 131.36 68.70 51.28 7.6 5.397605e-79 35.4 0.17 0.93 1.22 239.58 123.0 16.82 NaN NaN NaN NaN NaN NaN 15000.0 16600.0 1230.0 1040.0 1310.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 1.0 4.000000e+00 2.0 5.397605e-79 2.0 2.0 3.000000e+00 1.0 1.0
86 84221 0 5.11 5.16 2.0 2.0 NaN NaN 2.0 16.0 3.0 NaN 8.0 1521.0 256.45 148.59 45.18 11.1 1.620000e+03 31.6 NaN NaN NaN NaN 92.0 5.82 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 5.397605e-79 NaN NaN 2.0 NaN 5.397605e-79 2.0 1.0
87 84243 0 5.27 5.88 2.0 2.0 NaN NaN 2.0 16.0 5.0 NaN 7.0 656.0 71.78 23.40 20.67 2.6 2.535000e+02 26.5 0.27 0.58 4.50 NaN 95.0 14.36 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 5.397605e-79 NaN NaN 2.0 NaN 5.397605e-79 1.0 1.0
88 84245 0 5.72 6.72 2.0 2.0 NaN NaN 2.0 43.0 3.0 3.0 15.0 1794.0 134.75 72.21 105.83 12.7 2.010000e+03 28.9 0.19 0.75 0.45 NaN 103.0 7.07 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 8.000000e+00 2.0 NaN 2.0 2.0 5.397605e-79 1.0 1.0
89 84251 0 4.94 0.00 2.0 2.0 NaN NaN 1.0 20.0 4.0 2.0 6.0 NaN NaN NaN NaN NaN NaN 22.2 NaN NaN NaN NaN 89.0 1.71 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 3.0 1.000000e+00 2.0 2.000000e+00 1.0 2.0 1.000000e+00 2.0 2.0
90 84269 0 5.72 4.61 2.0 2.0 NaN NaN 1.0 26.0 5.0 4.0 4.0 1946.0 277.89 217.33 65.57 7.4 8.700000e+02 22.1 NaN NaN NaN NaN 103.0 3.71 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 2.0 2.000000e+00 1.0 1.000000e+00 1.0 2.0 5.397605e-79 1.0 2.0
91 84270 0 5.83 7.55 NaN NaN NaN NaN 1.0 12.0 3.0 NaN 2.0 2101.0 278.46 160.27 73.33 8.1 4.800000e+02 19.8 0.13 0.68 0.20 1.76 105.0 9.09 NaN NaN NaN NaN NaN NaN 495.0 4300.0 38.0 91.0 41.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 1.000000e+00 NaN NaN NaN NaN 2.000000e+00 1.0 2.0
92 84273 0 5.94 0.00 2.0 2.0 NaN NaN 1.0 63.0 5.0 2.0 6.0 1493.0 128.16 42.25 64.47 11.7 4.950000e+02 22.1 NaN NaN NaN NaN 107.0 5.39 0.07 0.07 2.6 2.10 1.70 1.30 NaN NaN NaN NaN NaN 0.70 2.0 5136.60 0.90 482.80 916.9 31.7 3.3 5.6 24.8 192.5 2.40 806.7 15.4 11.50 36.2 1.90 10.1 11.80 NaN 2.000000e+00 1.0 NaN 2.0 2.0 4.000000e+00 1.0 2.0
93 84278 0 5.66 7.22 NaN NaN NaN NaN 2.0 14.0 2.0 NaN 6.0 1857.0 195.63 70.50 96.79 13.4 5.397605e-79 29.6 0.20 0.39 0.20 NaN 102.0 15.54 0.07 0.07 0.2 0.50 0.07 0.07 NaN NaN NaN NaN NaN 0.40 22.0 24.70 0.10 1.20 61.5 13.8 3.2 12.0 3.7 7.3 1.30 15.4 2.8 0.57 4.5 1.20 1.7 5.80 NaN 2.000000e+00 NaN NaN NaN NaN 2.000000e+00 1.0 2.0
94 84287 0 5.44 3.11 2.0 2.0 NaN NaN 1.0 34.0 3.0 3.0 8.0 4497.0 436.58 244.52 193.05 21.5 5.397605e-79 25.6 0.56 3.48 0.70 2.36 98.0 5.02 NaN NaN NaN NaN NaN NaN 4750.0 6890.0 367.0 512.0 104.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 6.0 3.000000e+00 1.0 5.397605e-79 2.0 1.0 1.000000e+00 1.0 3.0
95 84299 0 5.88 5.66 2.0 2.0 NaN NaN 1.0 51.0 3.0 4.0 14.0 4911.0 617.44 347.51 210.35 25.5 5.070000e+02 35.0 0.47 0.78 0.33 3.68 106.0 30.17 NaN NaN NaN NaN NaN NaN 8520.0 12900.0 800.0 1220.0 594.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 8.0 2.000000e+00 1.0 5.397605e-79 2.0 2.0 5.397605e-79 1.0 1.0
96 84302 0 5.83 0.00 2.0 2.0 NaN NaN 1.0 74.0 3.0 3.0 9.0 1761.0 210.00 146.14 79.66 4.5 5.397605e-79 25.5 NaN NaN NaN NaN 105.0 10.46 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 3.000000e+00 1.0 5.397605e-79 2.0 2.0 2.000000e+00 1.0 1.0
97 84321 0 6.05 5.16 2.0 2.0 NaN NaN 2.0 62.0 1.0 4.0 8.0 2555.0 290.30 117.35 112.77 18.7 5.397605e-79 28.7 0.26 0.58 0.34 4.25 109.0 24.62 NaN NaN NaN NaN NaN NaN 4350.0 5830.0 53.0 94.0 87.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 1.000000e+00 2.0 2.000000e+00 2.0 1.0 1.000000e+00 1.0 1.0
98 84324 0 6.77 0.00 2.0 2.0 NaN NaN 1.0 59.0 3.0 2.0 4.0 5980.0 1222.34 980.92 85.19 10.1 5.397605e-79 27.3 1.53 1.59 0.28 8.15 122.0 11.78 NaN NaN NaN NaN NaN NaN 16800.0 27600.0 1140.0 2080.0 335.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 4.000000e+00 1.0 NaN 2.0 2.0 2.000000e+00 1.0 2.0
99 84326 0 4.77 6.16 NaN NaN NaN NaN 1.0 13.0 3.0 NaN 15.0 1221.0 157.54 86.09 52.04 6.1 4.200000e+02 16.8 NaN NaN NaN NaN 86.0 4.17 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 8.000000e+00 NaN NaN NaN NaN 5.397605e-79 2.0 1.0
InĀ [30]:
# removing DID060 because it is all NaN values
print(nhanes_merged["DID060"].isnull().all())
nhanes_merged = nhanes_merged.drop('DID060', axis='columns')

# removing BPQ020, BPQ080, & RHD143 because it is all no values
nhanes_merged = nhanes_merged.drop('BPQ020', axis='columns')
nhanes_merged = nhanes_merged.drop('BPQ080', axis='columns')
nhanes_merged = nhanes_merged.drop('RHD143', axis='columns')
True
InĀ [33]:
nhanes_merged.dtypes
Out[33]:
SEQN          int64
diabetes      int64
LBDGLUSI    float64
LBDGLTSI    float64
RIAGENDR    float64
RIDAGEYR    float64
RIDRETH1    float64
DMDEDUC2    float64
INDHHIN2    float64
DR1TKCAL    float64
DR1TCARB    float64
DR1TSUGR    float64
DR1TTFAT    float64
DR1TFIBE    float64
DR1_320Z    float64
BMXBMI      float64
LBXBCD      float64
LBXBPB      float64
LBXTHG      float64
URXUAS      float64
LBXGLU      float64
LBXIN       float64
LBXMPAH     float64
LBXPFDO     float64
LBXPFNA     float64
LBXPFHS     float64
LBXPFDE     float64
LBXPFUA     float64
URXP01      float64
URXP02      float64
URXP03      float64
URXP04      float64
URXP06      float64
URXBPH      float64
URXTRS      float64
URXBP3      float64
URXBUP      float64
URXEPB      float64
URXMPB      float64
URXPPB      float64
URXCNP      float64
URXCOP      float64
URXECP      float64
URXMBP      float64
URXMC1      float64
URXMEP      float64
URXMHH      float64
URXMHP      float64
URXMIB      float64
URXMNP      float64
URXMOH      float64
URXMZP      float64
ALQ130      float64
PAQ710      float64
SMQ020      float64
DPQ010      float64
SLQ050      float64
MCQ300C     float64
HUQ051      float64
HIQ011      float64
HOQ065      float64
dtype: object
InĀ [35]:
nhanes_merged.shape
Out[35]:
(10835, 61)

10835 rows or entries by 86 different columns or variables

InĀ [38]:
nhanes_merged['SEQN'].duplicated().any()
Out[38]:
False

No duplicate patients

Data Quality Discussion¶

I would say that overall, creating the dataset was very simple and straightforward. The only tedious task was having to download all of the datasets and importing them. Of course, there were some instances where the names of the variables changed from year to year like for income and # hours watch TV or videos past 30 days but they were easy to look up in the respective nhanes data dictionary.

InĀ [42]:
# converting categorical variables into binary & converting Refused and Don't Know entries into NaN
## gender RIAGENDR 0 = Male, 1 = Female
nhanes_merged["RIAGENDR"] = nhanes_merged["RIAGENDR"].replace(7, np.nan) ### check demographics
nhanes_merged["RIAGENDR"] = nhanes_merged["RIAGENDR"].replace(9, np.nan)
nhanes_merged["RIAGENDR"] = nhanes_merged["RIAGENDR"] - 1

## race/ethnicity RIDRETH1 0 = Non-Hispanic White, 1 = Other
nhanes_merged["RIAGENDR"] = nhanes_merged["RIAGENDR"].replace(7, np.nan) ### check demographics
nhanes_merged["RIAGENDR"] = nhanes_merged["RIAGENDR"].replace(9, np.nan)
nan_RIDRETH1 = nhanes_merged["RIDRETH1"].isna()
nhanes_merged["RIDRETH1"] = pd.get_dummies(nhanes_merged["RIDRETH1"], dtype=int)[3]
nhanes_merged["RIDRETH1"][nan_RIDRETH1] = np.nan

## education DMDEDUC2 0 = Up to GED/High School Diploma, 1 = Higher Education
nhanes_merged["DMDEDUC2"] = nhanes_merged["DMDEDUC2"].replace(7, np.nan)
nhanes_merged["DMDEDUC2"] = nhanes_merged["DMDEDUC2"].replace(9, np.nan)
nhanes_merged["DMDEDUC2"] = nhanes_merged["DMDEDUC2"].replace(4, 5)
nan_DMDEDUC2 = nhanes_merged["DMDEDUC2"].isna()
nhanes_merged["DMDEDUC2"] = pd.get_dummies(nhanes_merged["DMDEDUC2"], dtype=int)[5]
nhanes_merged["DMDEDUC2"][nan_DMDEDUC2] = np.nan

## annual household income INDHHIN2 0 = Under $75,000, 1 = $75,000 or higher
nhanes_merged["INDHHIN2"] = nhanes_merged["INDHHIN2"].replace(77, np.nan)
nhanes_merged["INDHHIN2"] = nhanes_merged["INDHHIN2"].replace(99, np.nan)
nan_INDHHIN2 = nhanes_merged["INDHHIN2"].isna()
nhanes_merged["INDHHIN2"] = pd.get_dummies(nhanes_merged["INDHHIN2"], dtype=int)[11]
nhanes_merged["INDHHIN2"][nan_INDHHIN2] = np.nan

## interest/motivation DPQ010 0 = Less then half the days, 1 = More than half the days
nhanes_merged["DPQ010"] = nhanes_merged["DPQ010"].replace(7, np.nan)
nhanes_merged["DPQ010"] = nhanes_merged["DPQ010"].replace(9, np.nan)
nhanes_merged["DPQ010"] = nhanes_merged["DPQ010"].replace(3, 2)
nan_DPQ010 = nhanes_merged["DPQ010"].isna()
nhanes_merged["DPQ010"] = pd.get_dummies(nhanes_merged["DPQ010"], dtype=int)[2]
nhanes_merged["DPQ010"][nan_DPQ010] = np.nan

## number of hours watched TV or videos a day PAQ710 0 = 2 hours or less, 1 = 3 hours or more
nhanes_merged["PAQ710"] = nhanes_merged["PAQ710"].replace(77, np.nan)
nhanes_merged["PAQ710"] = nhanes_merged["PAQ710"].replace(99, np.nan)
nhanes_merged["PAQ710"] = nhanes_merged["PAQ710"].replace(4, 3)
nhanes_merged["PAQ710"] = nhanes_merged["PAQ710"].replace(5, 3)
nan_PAQ710 = nhanes_merged["PAQ710"].isna()
nhanes_merged["PAQ710"] = pd.get_dummies(nhanes_merged["PAQ710"], dtype=int)[3]
nhanes_merged["PAQ710"][nan_PAQ710] = np.nan

## number of healthcare visits in past year HUQ051 0 = At Least Once, 1 = None
nhanes_merged["HUQ051"] = nhanes_merged["HUQ051"].replace(77, np.nan)
nhanes_merged["HUQ051"] = nhanes_merged["HUQ051"].replace(99, np.nan)
nan_HUQ051 = nhanes_merged["HUQ051"].isna()
nhanes_merged["HUQ051"] = pd.get_dummies(nhanes_merged["HUQ051"], dtype=int).iloc[:,0]
nhanes_merged["HUQ051"][nan_HUQ051] = np.nan

## home ownership HOQ065 0 = Not Own, 1 = Owned
nhanes_merged["HOQ065"] = nhanes_merged["HOQ065"].replace(7, np.nan)
nhanes_merged["HOQ065"] = nhanes_merged["HOQ065"].replace(9, np.nan)
nan_HOQ065 = nhanes_merged["HOQ065"].isna()
nhanes_merged["HOQ065"] = pd.get_dummies(nhanes_merged["HOQ065"], dtype=int)[1]
nhanes_merged["HOQ065"][nan_HOQ065] = np.nan

## smoked at least 100 cigarettes in life SMQ020 0 = Yes, 1 = No
nhanes_merged["SMQ020"] = nhanes_merged["SMQ020"].replace(7, np.nan)
nhanes_merged["SMQ020"] = nhanes_merged["SMQ020"].replace(9, np.nan)
nhanes_merged["SMQ020"] = nhanes_merged["SMQ020"] - 1

## family history of diabetes MCQ300C 0 = Yes, 1 = No
nhanes_merged["MCQ300C"] = nhanes_merged["MCQ300C"].replace(7, np.nan)
nhanes_merged["MCQ300C"] = nhanes_merged["MCQ300C"].replace(9, np.nan)
nhanes_merged["MCQ300C"] = nhanes_merged["MCQ300C"] - 1

## health insurance status HIQ011 0 = Yes, 1 = No
nhanes_merged["HIQ011"] = nhanes_merged["HIQ011"].replace(7, np.nan)
nhanes_merged["HIQ011"] = nhanes_merged["HIQ011"].replace(9, np.nan)
nhanes_merged["HIQ011"] = nhanes_merged["HIQ011"] - 1

## ever told doctor had trouble sleeping? SLQ050 0 = Yes, 1 = No
nhanes_merged["SLQ050"] = nhanes_merged["SLQ050"].replace(7, np.nan)
nhanes_merged["SLQ050"] = nhanes_merged["SLQ050"].replace(9, np.nan)
nhanes_merged["SLQ050"] = nhanes_merged["SLQ050"] - 1
InĀ [44]:
# continuous converting Refused and Don't Know entries into NaN
## ALQ130
nhanes_merged["ALQ130"] = nhanes_merged["ALQ130"].replace(777, np.nan)
nhanes_merged["ALQ130"] = nhanes_merged["ALQ130"].replace(999, np.nan)

80/20 Split¶

InĀ [47]:
train_data, test_data = train_test_split(nhanes_merged, test_size = 0.2, random_state = 78)
train_data = train_data.reset_index(drop=True)
test_data = test_data.reset_index(drop=True)

Univariate Statistics¶

InĀ [50]:
categorical_nhanes = ["RIAGENDR", "RIDRETH1", "DMDEDUC2", "INDHHIN2", "DPQ010", "PAQ710", "HUQ051", "HOQ065", "SMQ020", 
                      "diabetes", "MCQ300C", "HIQ011", "SLQ050"]
continuous_nhanes = ["RIDAGEYR", "BMXBMI", "ALQ130", "DR1TKCAL", "DR1TCARB", "DR1TSUGR", "DR1TTFAT", "LBXIN", "LBXGLU", 
                     "LBDGLUSI", "LBXBCD", "LBXBPB", "LBXTHG", "URXUAS", "LBXMPAH", "LBXPFDO", "LBXPFNA", "LBXPFHS", "LBXPFDE", "LBXPFUA", "URXBPH", "URXTRS", 
                     "URXBP3", "URXBUP", "URXEPB", "URXMPB", "URXPPB", "URXCNP", "URXCOP", "URXECP", "URXMBP", "URXMC1", 
                     "URXMEP", "URXMHH", "URXMHP", "URXMIB", "URXMNP", "URXMOH", "URXMZP", "URXP01", "URXP02", "URXP03",
                     "URXP04", "URXP06", "DR1TFIBE", "DR1_320Z"]
InĀ [52]:
# value counts for categorical/binary variables
for col in categorical_nhanes:
    print(train_data[col].value_counts())
RIAGENDR
0.0    4360
1.0    4308
Name: count, dtype: int64
RIDRETH1
0    5514
1    3154
Name: count, dtype: int64
DMDEDUC2
1.0    3108
0.0    2707
Name: count, dtype: int64
INDHHIN2
0.0    7933
1.0     330
Name: count, dtype: int64
DPQ010
0.0    5452
1.0     361
Name: count, dtype: int64
PAQ710
0.0    3534
1.0    2030
Name: count, dtype: int64
HUQ051
0.0    6688
1.0    1973
Name: count, dtype: int64
HOQ065
1.0    4720
0.0    3837
Name: count, dtype: int64
SMQ020
1.0    3584
0.0    2432
Name: count, dtype: int64
diabetes
0    8220
1     448
Name: count, dtype: int64
MCQ300C
1.0    3667
0.0    2034
Name: count, dtype: int64
HIQ011
0.0    6341
1.0    2304
Name: count, dtype: int64
SLQ050
1.0    6029
0.0    1156
Name: count, dtype: int64
InĀ [54]:
# summary statistics for continuous variables
for col in continuous_nhanes:
    print(train_data[col].describe())
count    8668.000000
mean       32.762344
std        17.916986
min        12.000000
25%        17.000000
50%        28.000000
75%        44.000000
max        85.000000
Name: RIDAGEYR, dtype: float64
count    8593.000000
mean       26.435478
std         6.515241
min        13.400000
25%        21.700000
50%        25.390000
75%        29.800000
max        68.600000
Name: BMXBMI, dtype: float64
count    3946.000000
mean        3.149265
std         3.096631
min         1.000000
25%         1.000000
50%         2.000000
75%         4.000000
max        82.000000
Name: ALQ130, dtype: float64
count     8212.000000
mean      2165.899903
std       1014.797245
min         93.000000
25%       1472.000000
50%       1994.000000
75%       2666.000000
max      12823.000000
Name: DR1TKCAL, dtype: float64
count    8212.000000
mean      268.898048
std       132.693628
min         0.030000
25%       178.500000
50%       248.210000
75%       332.420000
max      1670.260000
Name: DR1TCARB, dtype: float64
count    8.212000e+03
mean     1.217162e+02
std      8.204480e+01
min      5.397605e-79
25%      6.692000e+01
50%      1.055800e+02
75%      1.569600e+02
max      1.022980e+03
Name: DR1TSUGR, dtype: float64
count    8212.000000
mean       81.405588
std        46.647162
min         0.070000
25%        49.157500
50%        72.990000
75%       103.685000
max       601.330000
Name: DR1TTFAT, dtype: float64
count    8463.000000
mean       12.664783
std        11.481507
min         0.140000
25%         6.090000
50%         9.410000
75%        15.085000
max       136.790000
Name: LBXIN, dtype: float64
count    8668.000000
mean       99.255768
std        23.215104
min        55.000000
25%        90.000000
50%        96.000000
75%       102.000000
max       421.000000
Name: LBXGLU, dtype: float64
count    8668.000000
mean        5.509531
std         1.288871
min         3.053000
25%         4.996000
50%         5.329000
75%         5.662000
max        23.370000
Name: LBDGLUSI, dtype: float64
count    7333.000000
mean        0.430792
std         0.545120
min         0.070000
25%         0.140000
50%         0.250000
75%         0.460000
max         8.800000
Name: LBXBCD, dtype: float64
count    7333.000000
mean        1.325093
std         1.532151
min         0.050000
25%         0.610000
50%         0.950000
75%         1.540000
max        55.200000
Name: LBXBPB, dtype: float64
count    7333.000000
mean        1.277849
std         1.924049
min         0.110000
25%         0.360000
50%         0.680000
75%         1.380000
max        38.500000
Name: LBXTHG, dtype: float64
count    2856.000000
mean       18.592031
std        48.016661
min         0.260000
25%         3.850000
50%         7.100000
75%        15.250000
max      1195.000000
Name: URXUAS, dtype: float64
count    2834.000000
mean        0.289287
std         0.458981
min         0.060000
25%         0.070000
50%         0.160000
75%         0.300000
max        12.200000
Name: LBXMPAH, dtype: float64
count    2834.000000
mean        0.103617
std         0.074885
min         0.070000
25%         0.070000
50%         0.070000
75%         0.140000
max         2.500000
Name: LBXPFDO, dtype: float64
count    2834.000000
mean        1.079670
std         1.036864
min         0.058000
25%         0.550000
50%         0.820000
75%         1.307500
max        25.748000
Name: LBXPFNA, dtype: float64
count    2834.000000
mean        2.072068
std         2.564160
min         0.070000
25%         0.700000
50%         1.300000
75%         2.400000
max        25.600000
Name: LBXPFHS, dtype: float64
count    2834.000000
mean        0.321161
std         0.562450
min         0.070000
25%         0.140000
50%         0.200000
75%         0.380000
max        17.800000
Name: LBXPFDE, dtype: float64
count    2834.000000
mean        0.205953
std         0.309372
min         0.070000
25%         0.070000
50%         0.140000
75%         0.200000
max         6.300000
Name: LBXPFUA, dtype: float64
count    2805.000000
mean        3.289783
std        19.163532
min         0.140000
25%         0.800000
50%         1.700000
75%         3.100000
max       965.000000
Name: URXBPH, dtype: float64
count    2805.000000
mean       98.679540
std       280.416057
min         1.200000
25%         1.900000
50%         8.000000
75%        40.000000
max      2779.700000
Name: URXTRS, dtype: float64
count     2805.000000
mean       210.584877
std       1135.339327
min          0.280000
25%          4.900000
50%         14.100000
75%         55.200000
max      27200.000000
Name: URXBP3, dtype: float64
count    2805.000000
mean        3.067565
std        16.816298
min         0.070000
25%         0.100000
50%         0.140000
75%         0.400000
max       493.000000
Name: URXBUP, dtype: float64
count    2805.000000
mean       20.726307
std        89.432085
min         0.710000
25%         0.710000
50%         0.710000
75%         5.200000
max      1670.000000
Name: URXEPB, dtype: float64
count     2805.000000
mean       243.073144
std        588.042226
min          0.710000
25%         12.800000
50%         55.000000
75%        222.000000
max      12700.000000
Name: URXMPB, dtype: float64
count    2805.000000
mean       57.291094
std       154.411930
min         0.070000
25%         1.100000
50%         6.700000
75%        42.900000
max      2650.000000
Name: URXPPB, dtype: float64
count    2805.000000
mean        5.498061
std        18.922592
min         0.140000
25%         1.200000
50%         2.400000
75%         4.620000
max       730.250000
Name: URXCNP, dtype: float64
count    2805.000000
mean       35.131515
std        83.125018
min         0.210000
25%         4.100000
50%         9.500000
75%        25.800000
max       979.800000
Name: URXCOP, dtype: float64
count    2805.000000
mean       49.044674
std       139.269138
min         0.140000
25%         8.200000
50%        17.100000
75%        39.100000
max      3252.400000
Name: URXECP, dtype: float64
count    2805.000000
mean       23.856873
std        31.452609
min         0.280000
25%         7.300000
50%        15.000000
75%        29.200000
max       549.100000
Name: URXMBP, dtype: float64
count    2805.000000
mean        5.810806
std        19.565722
min         0.140000
25%         1.000000
50%         2.160000
75%         4.900000
max       564.000000
Name: URXMC1, dtype: float64
count     2805.000000
mean       228.292381
std        888.435109
min          0.373400
25%         22.000000
50%         57.222000
75%        163.548000
max      31660.000000
Name: URXMEP, dtype: float64
count    2805.000000
mean       35.862503
std       121.366228
min         0.140000
25%         4.500000
50%        11.100000
75%        24.400000
max      2622.400000
Name: URXMHH, dtype: float64
count    2805.000000
mean        4.724720
std        14.180914
min         0.350000
25%         0.780000
50%         1.500000
75%         3.600000
max       296.010000
Name: URXMHP, dtype: float64
count    2805.000000
mean       14.216086
std        21.582126
min         0.140000
25%         4.300000
50%         8.800000
75%        17.500000
max       600.360000
Name: URXMIB, dtype: float64
count    2805.000000
mean        2.438085
std         7.667234
min         0.350000
25%         0.640000
50%         0.871200
75%         1.100000
max       185.190000
Name: URXMNP, dtype: float64
count    2805.000000
mean       21.401697
std        72.577524
min         0.140000
25%         3.030000
50%         7.000000
75%        15.500000
max      1815.700000
Name: URXMOH, dtype: float64
count    2805.000000
mean       14.553603
std        25.535677
min         0.150000
25%         2.900000
50%         7.200000
75%        16.680000
max       432.340000
Name: URXMZP, dtype: float64
count    2.712000e+03
mean     8.922565e+03
std      1.184863e+05
min      3.390000e+01
25%      6.048250e+02
50%      1.420000e+03
75%      4.590900e+03
max      5.259896e+06
Name: URXP01, dtype: float64
count     2734.000000
mean      8378.445830
std      10927.613086
min        107.000000
25%       2099.550000
50%       4388.000000
75%       9993.250000
max      99149.800000
Name: URXP02, dtype: float64
count    2737.000000
mean      286.377019
std       579.517736
min         5.700000
25%        43.000000
50%        89.000000
75%       220.000000
max      9800.000000
Name: URXP03, dtype: float64
count     2746.00000
mean       515.64177
std        911.35088
min          5.70000
25%        109.85000
50%        217.50000
75%        490.15000
max      11000.00000
Name: URXP04, dtype: float64
count    2756.000000
mean      186.918541
std       247.883314
min         6.400000
25%        64.000000
50%       119.000000
75%       221.000000
max      5864.000000
Name: URXP06, dtype: float64
count    8.212000e+03
mean     1.585567e+01
std      1.013999e+01
min      5.397605e-79
25%      9.000000e+00
50%      1.370000e+01
75%      2.020000e+01
max      1.476000e+02
Name: DR1TFIBE, dtype: float64
count    8.212000e+03
mean     9.473273e+02
std      1.078099e+03
min      5.397605e-79
25%      8.888000e+01
50%      5.925000e+02
75%      1.422000e+03
max      1.344000e+04
Name: DR1_320Z, dtype: float64
InĀ [56]:
# skewness for continuous variables
for col in continuous_nhanes:
    print(col, train_data[col].skew())
RIDAGEYR 0.8672621568446248
BMXBMI 1.1462643133680315
ALQ130 6.4381175237368335
DR1TKCAL 1.4356755307383746
DR1TCARB 1.607732347529472
DR1TSUGR 2.2577637335569793
DR1TTFAT 1.672442042483449
LBXIN 3.5652171420709124
LBXGLU 6.8029425992903105
LBDGLUSI 6.8038321787185945
LBXBCD 4.5048509935695185
LBXBPB 11.47756096435242
LBXTHG 5.698266037863675
URXUAS 11.727396509827233
LBXMPAH 11.058890832571905
LBXPFDO 15.010882440719659
LBXPFNA 8.517421532102865
LBXPFHS 3.988378698636329
LBXPFDE 15.696840664557849
LBXPFUA 7.702216337139375
URXBPH 45.572501663930346
URXTRS 4.873152170159706
URXBP3 14.199414991670123
URXBUP 15.548290772509839
URXEPB 8.966710675323327
URXMPB 7.909891162993048
URXPPB 6.548659119693288
URXCNP 23.390371231747434
URXCOP 5.599068661042671
URXECP 11.183242086545858
URXMBP 5.954095388975746
URXMC1 15.623827047208819
URXMEP 19.943051710448454
URXMHH 11.931674405382374
URXMHP 10.213954921191291
URXMIB 11.149177383117541
URXMNP 10.48701049415024
URXMOH 13.089600441871058
URXMZP 7.1876636146873825
URXP01 38.41994472859625
URXP02 3.220380836524587
URXP03 5.347160513074789
URXP04 5.0060403203933
URXP06 7.7526891624003165
DR1TFIBE 1.9555736383641864
DR1_320Z 2.034475117803634
InĀ [58]:
# testing skewness with log transformations
for col in continuous_nhanes:
    print(col, np.log(train_data[col]).skew())
RIDAGEYR 0.11878386900215072
BMXBMI 0.3825506192962267
ALQ130 0.49048167778687485
DR1TKCAL -0.4839475885237581
DR1TCARB -1.203029953989769
DR1TSUGR -75.62626974189726
DR1TTFAT -1.044074825925181
LBXIN 0.12463739036539195
LBXGLU 3.4480394895575137
LBDGLUSI 3.4487586871975653
LBXBCD 0.8418863463720669
LBXBPB 0.42568039164064253
LBXTHG 0.5142849970205279
URXUAS 0.5928336027292153
LBXMPAH 0.6515821205470501
LBXPFDO 1.3490181097460523
LBXPFNA -0.15900846087694975
LBXPFHS -0.22484170406839224
LBXPFDE 0.642921609003418
LBXPFUA 1.252271600441551
URXBPH 0.23254414913906832
URXTRS 0.7550411702003581
URXBP3 0.5685894611836925
URXBUP 1.7159076836901581
URXEPB 1.4328065867310291
URXMPB 0.010052198016668214
URXPPB 0.08541909117296817
URXCNP 0.39736251358159236
URXCOP 0.4020180776696439
URXECP 0.3455273049645498
URXMBP -0.6468760047480185
URXMC1 0.3328804634889151
URXMEP 0.30809018302531327
URXMHH 0.3666079920511518
URXMHP 0.9099345315127817
URXMIB -0.41426540504478615
URXMNP 2.036429286931458
URXMOH 0.3504792899416325
URXMZP -0.2650781562248332
URXP01 0.5576786277571182
URXP02 -0.0209716001833414
URXP03 0.5858877015847936
URXP04 0.42857292174379424
URXP06 0.07125463495949712
DR1TFIBE -33.428860277892866
DR1_320Z -1.2033472414989406
InĀ [60]:
# missing data
train_data.isnull().sum()/len(train_data)*100
Out[60]:
SEQN         0.000000
diabetes     0.000000
LBDGLUSI     0.000000
LBDGLTSI     0.000000
RIAGENDR     0.000000
RIDAGEYR     0.000000
RIDRETH1     0.000000
DMDEDUC2    32.914167
INDHHIN2     4.672358
DR1TKCAL     5.260729
DR1TCARB     5.260729
DR1TSUGR     5.260729
DR1TTFAT     5.260729
DR1TFIBE     5.260729
DR1_320Z     5.260729
BMXBMI       0.865251
LBXBCD      15.401477
LBXBPB      15.401477
LBXTHG      15.401477
URXUAS      67.051223
LBXGLU       0.000000
LBXIN        2.365021
LBXMPAH     67.305030
LBXPFDO     67.305030
LBXPFNA     67.305030
LBXPFHS     67.305030
LBXPFDE     67.305030
LBXPFUA     67.305030
URXP01      68.712506
URXP02      68.458699
URXP03      68.424089
URXP04      68.320258
URXP06      68.204892
URXBPH      67.639594
URXTRS      67.639594
URXBP3      67.639594
URXBUP      67.639594
URXEPB      67.639594
URXMPB      67.639594
URXPPB      67.639594
URXCNP      67.639594
URXCOP      67.639594
URXECP      67.639594
URXMBP      67.639594
URXMC1      67.639594
URXMEP      67.639594
URXMHH      67.639594
URXMHP      67.639594
URXMIB      67.639594
URXMNP      67.639594
URXMOH      67.639594
URXMZP      67.639594
ALQ130      54.476234
PAQ710      35.809875
SMQ020      30.595293
DPQ010      32.937240
SLQ050      17.108906
MCQ300C     34.229349
HUQ051       0.080757
HIQ011       0.265344
HOQ065       1.280572
dtype: float64
InĀ [62]:
# missing data percentage
train_data.isnull().sum()/len(train_data)*100
Out[62]:
SEQN         0.000000
diabetes     0.000000
LBDGLUSI     0.000000
LBDGLTSI     0.000000
RIAGENDR     0.000000
RIDAGEYR     0.000000
RIDRETH1     0.000000
DMDEDUC2    32.914167
INDHHIN2     4.672358
DR1TKCAL     5.260729
DR1TCARB     5.260729
DR1TSUGR     5.260729
DR1TTFAT     5.260729
DR1TFIBE     5.260729
DR1_320Z     5.260729
BMXBMI       0.865251
LBXBCD      15.401477
LBXBPB      15.401477
LBXTHG      15.401477
URXUAS      67.051223
LBXGLU       0.000000
LBXIN        2.365021
LBXMPAH     67.305030
LBXPFDO     67.305030
LBXPFNA     67.305030
LBXPFHS     67.305030
LBXPFDE     67.305030
LBXPFUA     67.305030
URXP01      68.712506
URXP02      68.458699
URXP03      68.424089
URXP04      68.320258
URXP06      68.204892
URXBPH      67.639594
URXTRS      67.639594
URXBP3      67.639594
URXBUP      67.639594
URXEPB      67.639594
URXMPB      67.639594
URXPPB      67.639594
URXCNP      67.639594
URXCOP      67.639594
URXECP      67.639594
URXMBP      67.639594
URXMC1      67.639594
URXMEP      67.639594
URXMHH      67.639594
URXMHP      67.639594
URXMIB      67.639594
URXMNP      67.639594
URXMOH      67.639594
URXMZP      67.639594
ALQ130      54.476234
PAQ710      35.809875
SMQ020      30.595293
DPQ010      32.937240
SLQ050      17.108906
MCQ300C     34.229349
HUQ051       0.080757
HIQ011       0.265344
HOQ065       1.280572
dtype: float64

Univariate Visualizations¶

InĀ [65]:
# boxplots for continuous variables
for col in continuous_nhanes:
    plt.boxplot(train_data[col].dropna())
    plt.ylabel('Frequency')
    plt.xlabel(col)
    plt.show()
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
InĀ [66]:
# histograms for continuous variables
for col in continuous_nhanes:
    plt.hist(train_data[col], color='skyblue', edgecolor='black')
    plt.ylabel('Frequency')
    plt.xlabel(col)
    plt.show()
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
InĀ [67]:
# bar charts for categorical variables
for col in categorical_nhanes:
    ax = train_data[col].value_counts().plot(kind = 'bar')
    ax.set_ylabel("Frequency")
    ax.set_xlabel(col)
    plt.show()
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image

Discussion for Univariate Visualizations¶

All of the continuous variables seem to be right skewed with many outliers. When looking at the outliers in the boxplots and comparing them to the histograms, they seem to make sense because the bulk of the data seems to have lower values but there are sizeable counts of individuals with larger values that decrease the larger the values get. Applying a log transformation to the all of the continuous data (all of which are skewed) decreases the skewness for most of them. The only instance where the skew gets worse is with DR1TFIBE where the skew goes from a small positive value to a large negative value after the log transformation.

Log Transformations¶

InĀ [72]:
continuous_nhanes_log = ["RIDAGEYR", "BMXBMI", "ALQ130", "DR1TKCAL", "DR1TCARB", "DR1TSUGR", "DR1TTFAT", "LBXIN", "LBXGLU", 
                     "LBDGLUSI", "LBXBCD", "LBXBPB", "LBXTHG", "URXUAS", "LBXMPAH", "LBXPFDO", "LBXPFNA", "LBXPFHS", "LBXPFDE", "LBXPFUA", "URXBPH", "URXTRS", 
                     "URXBP3", "URXBUP", "URXEPB", "URXMPB", "URXPPB", "URXCNP", "URXCOP", "URXECP", "URXMBP", "URXMC1", 
                     "URXMEP", "URXMHH", "URXMHP", "URXMIB", "URXMNP", "URXMOH", "URXMZP", "URXP01", "URXP02", "URXP03",
                     "URXP04", "URXP06", "DR1_320Z"]
for col in continuous_nhanes_log:
    train_data[col] = np.log(train_data[col])
    test_data[col] = np.log(test_data[col])

Imputation¶

InĀ [75]:
# mode
def impute_mode(df, column_name):
  mode_value = df[column_name].mode()[0]
  df[column_name].fillna(mode_value, inplace=True)
  return df

# median
def impute_median(df, column_name):
  median_value = df[column_name].median()
  df[column_name].fillna(median_value, inplace=True)
  return df

# Initialize MICE imputer
imputer = IterativeImputer(max_iter = 10, random_state = 78)
InĀ [77]:
# Demographic Variables (Education, Income) -- Mode
train_data = impute_mode(train_data, "DMDEDUC2")
train_data = impute_mode(train_data, "INDHHIN2")

# Health Variables (BMI, Alcohol, Sleep) -- Median
train_data = impute_median(train_data, "BMXBMI")
train_data = impute_median(train_data, "ALQ130")
train_data = impute_median(train_data, "SLQ050")

# Dietary Variables (Calories, Carbs, Sugar, Fat, Fiber, Water) -- Multivariate Imputation by Chained Equations (MICE)
dietary_imputed = pd.DataFrame(imputer.fit_transform(train_data[["DR1TKCAL", "DR1TCARB", "DR1TSUGR", "DR1TTFAT", "DR1TFIBE", "DR1_320Z"]]), columns=["DR1TKCAL", "DR1TCARB", "DR1TSUGR", "DR1TTFAT", "DR1TFIBE", "DR1_320Z"])
train_data[["DR1TKCAL", "DR1TCARB", "DR1TSUGR", "DR1TTFAT", "DR1TFIBE", "DR1_320Z"]] = dietary_imputed

# Heavy Metals (Cadmium, Mercury, Lead, Arsenic) -- Multivariate Imputation by Chained Equations (MICE)/median
metals_imputed = pd.DataFrame(imputer.fit_transform(train_data[["LBXBCD", "LBXBPB", "LBXTHG"]]), columns=["LBXBCD", "LBXBPB", "LBXTHG"])
train_data[["LBXBCD", "LBXBPB", "LBXTHG"]] = metals_imputed
train_data = impute_median(train_data, "URXUAS") # keep same

# Glucose -- Median
train_data = impute_median(train_data, "LBXIN")

# PFAs -- Multivariate Imputation by Chained Equations (MICE)
pfa_imputed = pd.DataFrame(imputer.fit_transform(train_data[["LBXMPAH", "LBXPFDO", "LBXPFNA", "LBXPFHS", "LBXPFDE", "LBXPFUA"]]), columns=["LBXMPAH", "LBXPFDO", "LBXPFNA", "LBXPFHS", "LBXPFDE", "LBXPFUA"])
train_data[["LBXMPAH", "LBXPFDO", "LBXPFNA", "LBXPFHS", "LBXPFDE", "LBXPFUA"]] = pfa_imputed

# PAHs -- Multivariate Imputation by Chained Equations (MICE)
pah_imputed = pd.DataFrame(imputer.fit_transform(train_data[["URXP01", "URXP02", "URXP03", "URXP04", "URXP06"]]), columns=["URXP01", "URXP02", "URXP03", "URXP04", "URXP06"])
train_data[["URXP01", "URXP02", "URXP03", "URXP04", "URXP06"]] = pah_imputed

# Environmental Phthalates -- Multivariate Imputation by Chained Equations (MICE)
eph_imputed = pd.DataFrame(imputer.fit_transform(train_data[["URXBPH", "URXTRS", "URXBP3", "URXBUP", "URXEPB", "URXMPB", "URXPPB"]]), columns=["URXBPH", "URXTRS", "URXBP3", "URXBUP", "URXEPB", "URXMPB", "URXPPB"])
train_data[["URXBPH", "URXTRS", "URXBP3", "URXBUP", "URXEPB", "URXMPB", "URXPPB"]] = eph_imputed

# Phthalates -- Multivariate Imputation by Chained Equations (MICE)
phthalates_imputed = pd.DataFrame(imputer.fit_transform(train_data[["URXCNP", "URXCOP", "URXECP", "URXMBP", "URXMC1", "URXMEP", "URXMHH", "URXMHP", "URXMIB", "URXMNP", "URXMOH", "URXMZP"]]), columns=["URXCNP", "URXCOP", "URXECP", "URXMBP", "URXMC1", "URXMEP", "URXMHH", "URXMHP", "URXMIB", "URXMNP", "URXMOH", "URXMZP"])
train_data[["URXCNP", "URXCOP", "URXECP", "URXMBP", "URXMC1", "URXMEP", "URXMHH", "URXMHP", "URXMIB", "URXMNP", "URXMOH", "URXMZP"]] = phthalates_imputed

# Behavioral Variables -- Mode
train_data = impute_mode(train_data, "HUQ051")
train_data = impute_mode(train_data, "HIQ011")
train_data = impute_mode(train_data, "HOQ065")
train_data = impute_mode(train_data, "MCQ300C")
train_data = impute_mode(train_data, "PAQ710")
train_data = impute_mode(train_data, "SMQ020")
train_data = impute_mode(train_data, "DPQ010")
InĀ [79]:
# Demographic Variables (Education, Income) -- Mode
test_data = impute_mode(test_data, "DMDEDUC2")
test_data = impute_mode(test_data, "INDHHIN2")

# Health Variables (BMI, Alcohol, Sleep) -- Median
test_data = impute_median(test_data, "BMXBMI")
test_data = impute_median(test_data, "ALQ130")
test_data = impute_median(test_data, "SLQ050")

# Dietary Variables (Calories, Carbs, Sugar, Fat, Fiber, Water) -- Multivariate Imputation by Chained Equations (MICE)
dietary_imputed = pd.DataFrame(imputer.fit_transform(test_data[["DR1TKCAL", "DR1TCARB", "DR1TSUGR", "DR1TTFAT", "DR1TFIBE", "DR1_320Z"]]), columns=["DR1TKCAL", "DR1TCARB", "DR1TSUGR", "DR1TTFAT", "DR1TFIBE", "DR1_320Z"])
test_data[["DR1TKCAL", "DR1TCARB", "DR1TSUGR", "DR1TTFAT", "DR1TFIBE", "DR1_320Z"]] = dietary_imputed

# Heavy Metals (Cadmium, Mercury, Lead, Arsenic) -- Multivariate Imputation by Chained Equations (MICE)/median
metals_imputed = pd.DataFrame(imputer.fit_transform(test_data[["LBXBCD", "LBXBPB", "LBXTHG"]]), columns=["LBXBCD", "LBXBPB", "LBXTHG"])
test_data[["LBXBCD", "LBXBPB", "LBXTHG"]] = metals_imputed
test_data = impute_median(test_data, "URXUAS") # keep same

# Glucose -- Median
test_data = impute_median(test_data, "LBXIN")

# PFAs -- Multivariate Imputation by Chained Equations (MICE)
pfa_imputed = pd.DataFrame(imputer.fit_transform(test_data[["LBXMPAH", "LBXPFDO", "LBXPFNA", "LBXPFHS", "LBXPFDE", "LBXPFUA"]]), columns=["LBXMPAH", "LBXPFDO", "LBXPFNA", "LBXPFHS", "LBXPFDE", "LBXPFUA"])
test_data[["LBXMPAH", "LBXPFDO", "LBXPFNA", "LBXPFHS", "LBXPFDE", "LBXPFUA"]] = pfa_imputed

# PAHs -- Multivariate Imputation by Chained Equations (MICE)
pah_imputed = pd.DataFrame(imputer.fit_transform(test_data[["URXP01", "URXP02", "URXP03", "URXP04", "URXP06"]]), columns=["URXP01", "URXP02", "URXP03", "URXP04", "URXP06"])
test_data[["URXP01", "URXP02", "URXP03", "URXP04", "URXP06"]] = pah_imputed

# Environmental Phthalates -- Multivariate Imputation by Chained Equations (MICE)
eph_imputed = pd.DataFrame(imputer.fit_transform(test_data[["URXBPH", "URXTRS", "URXBP3", "URXBUP", "URXEPB", "URXMPB", "URXPPB"]]), columns=["URXBPH", "URXTRS", "URXBP3", "URXBUP", "URXEPB", "URXMPB", "URXPPB"])
test_data[["URXBPH", "URXTRS", "URXBP3", "URXBUP", "URXEPB", "URXMPB", "URXPPB"]] = eph_imputed

# Phthalates -- Multivariate Imputation by Chained Equations (MICE)
phthalates_imputed = pd.DataFrame(imputer.fit_transform(test_data[["URXCNP", "URXCOP", "URXECP", "URXMBP", "URXMC1", "URXMEP", "URXMHH", "URXMHP", "URXMIB", "URXMNP", "URXMOH", "URXMZP"]]), columns=["URXCNP", "URXCOP", "URXECP", "URXMBP", "URXMC1", "URXMEP", "URXMHH", "URXMHP", "URXMIB", "URXMNP", "URXMOH", "URXMZP"])
test_data[["URXCNP", "URXCOP", "URXECP", "URXMBP", "URXMC1", "URXMEP", "URXMHH", "URXMHP", "URXMIB", "URXMNP", "URXMOH", "URXMZP"]] = phthalates_imputed

# Behavioral Variables -- Mode
test_data = impute_mode(test_data, "HUQ051")
test_data = impute_mode(test_data, "HIQ011")
test_data = impute_mode(test_data, "HOQ065")
test_data = impute_mode(test_data, "MCQ300C")
test_data = impute_mode(test_data, "PAQ710")
test_data = impute_mode(test_data, "SMQ020")
test_data = impute_mode(test_data, "DPQ010")

Exploratory Data Analysis and Bivariate Analysis¶

InĀ [82]:
# correlations
numerical_nhanes = train_data[continuous_nhanes]
numerical_nhanes["diabetes"] = pd.to_numeric(train_data["diabetes"], errors="coerce")
corr = numerical_nhanes.loc[:,~numerical_nhanes.columns.duplicated()].corr()
corr.head(100)
corr.style.background_gradient(cmap='coolwarm')
Out[82]:
Ā  RIDAGEYR BMXBMI ALQ130 DR1TKCAL DR1TCARB DR1TSUGR DR1TTFAT LBXIN LBXGLU LBDGLUSI LBXBCD LBXBPB LBXTHG URXUAS LBXMPAH LBXPFDO LBXPFNA LBXPFHS LBXPFDE LBXPFUA URXBPH URXTRS URXBP3 URXBUP URXEPB URXMPB URXPPB URXCNP URXCOP URXECP URXMBP URXMC1 URXMEP URXMHH URXMHP URXMIB URXMNP URXMOH URXMZP URXP01 URXP02 URXP03 URXP04 URXP06 DR1TFIBE DR1_320Z diabetes
RIDAGEYR 1.000000 0.276835 -0.049195 -0.022535 -0.063251 -0.025177 -0.027491 -0.177180 0.275097 0.275131 0.359195 0.415399 0.280666 0.078363 0.008716 -0.000235 0.071768 0.038425 0.104914 0.114822 -0.052036 0.007028 -0.007265 0.000539 0.050070 0.017795 0.002019 -0.019836 -0.039036 -0.072532 -0.077049 -0.052441 0.008772 -0.071952 -0.057099 -0.078938 -0.037882 -0.084542 -0.127855 0.089793 0.011790 0.020125 0.034065 0.024007 0.110284 0.021556 0.231925
BMXBMI 0.276835 1.000000 0.070655 -0.010689 -0.039629 -0.002268 0.004131 0.465490 0.226896 0.226870 0.024432 0.015792 0.031764 0.016020 -0.057258 -0.028180 -0.022313 -0.022017 -0.034604 -0.050581 0.017345 0.006948 -0.011590 -0.027212 -0.015703 0.013729 0.004688 0.054427 0.046988 0.024656 0.016557 0.022802 0.072135 0.023753 0.011157 0.043477 0.000334 0.020893 0.018343 0.000241 0.096730 0.000286 0.045689 0.057947 -0.003748 0.024218 0.144196
ALQ130 -0.049195 0.070655 1.000000 0.122885 0.070473 0.010235 0.062827 0.007606 0.056448 0.056438 0.097814 0.095320 -0.041142 0.020626 -0.034401 -0.013071 -0.011042 0.016803 -0.025146 -0.034601 0.054806 -0.012328 -0.047695 -0.027835 0.015435 -0.022472 -0.033499 0.016736 0.010988 0.017814 0.030500 0.031569 0.025856 0.025792 0.028521 0.025864 0.023444 0.019869 0.040268 0.087382 0.103168 0.126131 0.118798 0.079881 -0.015875 -0.043760 0.020119
DR1TKCAL -0.022535 -0.010689 0.122885 1.000000 0.877693 0.258928 0.872380 0.000511 0.007269 0.007290 -0.004915 0.034032 0.002365 0.011704 0.015039 0.002527 0.022639 0.048849 0.010015 -0.020095 0.010087 0.019699 -0.018234 -0.038261 -0.014865 -0.039840 -0.055292 0.040792 0.034353 0.054915 -0.010179 0.051504 0.002181 0.061417 0.045509 -0.021754 0.037501 0.054698 0.012892 0.029931 -0.011944 0.034726 0.028441 0.016261 0.534127 -0.043463 -0.027442
DR1TCARB -0.063251 -0.039629 0.070473 0.877693 1.000000 0.450036 0.633181 0.029471 -0.000078 -0.000066 -0.033193 0.001565 -0.044464 -0.015182 0.027896 0.001369 0.012000 0.036902 -0.013819 -0.035910 0.011760 0.015637 -0.025200 -0.027503 -0.038929 -0.037678 -0.052249 0.033731 0.019189 0.053835 -0.007952 0.040538 0.006686 0.051327 0.030530 -0.023047 0.016704 0.048178 0.014769 0.014994 -0.020790 0.011470 0.009867 0.005457 0.550612 -0.064822 -0.030970
DR1TSUGR -0.025177 -0.002268 0.010235 0.258928 0.450036 1.000000 0.175195 0.025237 0.008435 0.008442 -0.021060 -0.013685 -0.025431 -0.010501 0.015692 0.006104 0.004378 0.013645 -0.006975 -0.014593 0.014659 0.004016 -0.007062 -0.001962 -0.014748 -0.002526 -0.007367 0.017578 0.008009 0.024111 0.010608 0.017884 0.009579 0.022351 0.013380 0.001655 0.004944 0.022366 0.014281 0.011504 0.000098 0.009283 0.009833 0.010220 0.118924 -0.048236 -0.011102
DR1TTFAT -0.027491 0.004131 0.062827 0.872380 0.633181 0.175195 1.000000 0.019355 0.007106 0.007119 -0.035198 -0.010152 -0.013558 -0.006299 -0.000635 0.000724 0.003414 0.037473 0.001049 -0.035104 0.007298 0.017551 -0.006594 -0.032553 -0.018610 -0.023084 -0.031622 0.045956 0.042326 0.054523 -0.006636 0.054428 -0.002662 0.062336 0.048650 -0.014188 0.040118 0.059006 0.017889 0.007935 -0.015958 0.014238 0.009136 0.003620 0.418043 -0.011403 -0.023162
LBXIN -0.177180 0.465490 0.007606 0.000511 0.029471 0.025237 0.019355 1.000000 0.211596 0.211561 -0.151057 -0.138398 -0.112682 -0.021270 -0.052404 -0.053862 -0.037252 -0.038886 -0.086738 -0.106293 0.019975 -0.011724 -0.034506 -0.032427 -0.063172 -0.006174 -0.010344 0.032946 0.064482 0.061831 0.049967 0.052259 0.037279 0.062876 0.031826 0.079954 0.040170 0.062996 0.063261 -0.047075 0.045784 -0.041164 -0.006053 0.017406 -0.030956 -0.027378 0.124081
LBXGLU 0.275097 0.226896 0.056448 0.007269 -0.000078 0.008435 0.007106 0.211596 1.000000 0.999994 0.050378 0.144172 0.033099 0.024445 -0.011941 -0.006374 0.011420 0.015452 0.024183 0.002555 -0.036025 -0.022449 -0.068459 -0.047853 -0.060605 -0.046914 -0.066700 -0.009345 -0.000618 -0.004863 -0.012439 -0.004157 0.004544 -0.004699 -0.012199 0.007175 -0.002584 -0.010806 -0.041545 0.017032 0.034159 -0.006506 0.011443 0.003245 0.032780 0.008299 0.652247
LBDGLUSI 0.275131 0.226870 0.056438 0.007290 -0.000066 0.008442 0.007119 0.211561 0.999994 1.000000 0.050424 0.144230 0.033159 0.024522 -0.011908 -0.006326 0.011426 0.015497 0.024244 0.002633 -0.036008 -0.022485 -0.068446 -0.047854 -0.060634 -0.046903 -0.066726 -0.009302 -0.000629 -0.004815 -0.012410 -0.004097 0.004545 -0.004660 -0.012172 0.007154 -0.002558 -0.010765 -0.041558 0.017038 0.034114 -0.006522 0.011431 0.003207 0.032748 0.008312 0.652336
LBXBCD 0.359195 0.024432 0.097814 -0.004915 -0.033193 -0.021060 -0.035198 -0.151057 0.050378 0.050424 1.000000 0.385347 0.127550 0.056038 0.042105 0.016960 0.021396 -0.010610 0.035651 0.059691 -0.004681 -0.026339 -0.035683 0.004770 0.051736 0.003359 0.000491 -0.021199 -0.039583 -0.024228 0.008434 -0.018317 0.004204 -0.020392 -0.003666 -0.018351 -0.017925 -0.024708 -0.021559 0.283455 0.170835 0.287831 0.265966 0.109110 -0.047857 -0.081805 0.068836
LBXBPB 0.415399 0.015792 0.095320 0.034032 0.001565 -0.013685 -0.010152 -0.138398 0.144172 0.144230 0.385347 1.000000 0.199550 0.107696 0.076846 0.092868 0.091027 0.052800 0.099836 0.102385 -0.001403 -0.029658 -0.079913 -0.031903 -0.009271 -0.045575 -0.073508 -0.019247 -0.077987 0.004016 0.004545 -0.031711 0.022656 0.005668 -0.005111 -0.036558 -0.044696 -0.005132 -0.033446 0.168714 0.008869 0.121121 0.118584 0.074794 0.040778 -0.053055 0.081720
LBXTHG 0.280666 0.031764 -0.041142 0.002365 -0.044464 -0.025431 -0.013558 -0.112682 0.033099 0.033159 0.127550 0.199550 1.000000 0.300720 -0.008065 0.042309 0.089770 0.001277 0.151698 0.228828 -0.040945 0.043969 0.056446 0.052046 0.059693 0.024701 0.013835 -0.013142 -0.012859 -0.023710 -0.035459 -0.025321 -0.006683 -0.021957 -0.000913 -0.027209 -0.008042 -0.026370 -0.064066 0.003810 -0.072821 -0.043994 -0.040516 -0.017936 0.073392 0.081617 0.029791
URXUAS 0.078363 0.016020 0.020626 0.011704 -0.015182 -0.010501 -0.006299 -0.021270 0.024445 0.024522 0.056038 0.107696 0.300720 1.000000 0.007184 0.031487 0.049663 -0.002054 0.072972 0.098317 0.036320 0.026710 0.028407 0.019473 0.040041 0.040130 0.024067 0.034547 0.046197 0.039519 0.057045 0.055757 0.021110 0.036763 0.056583 0.051446 0.051441 0.036990 0.031205 0.092305 0.105401 0.098307 0.112249 0.144847 -0.005508 0.006512 0.018170
LBXMPAH 0.008716 -0.057258 -0.034401 0.015039 0.027896 0.015692 -0.000635 -0.052404 -0.011941 -0.011908 0.042105 0.076846 -0.008065 0.007184 1.000000 0.367104 0.273211 0.264158 0.262634 0.171176 0.075008 0.051121 -0.011414 0.080892 0.007470 0.007771 0.001948 0.034075 -0.046931 0.117080 0.094307 0.057998 0.086494 0.110962 0.059949 -0.003407 -0.000381 0.109072 0.078877 0.016038 -0.017730 0.001807 -0.002726 -0.006888 -0.022200 -0.037555 -0.005594
LBXPFDO -0.000235 -0.028180 -0.013071 0.002527 0.001369 0.006104 0.000724 -0.053862 -0.006374 -0.006326 0.016960 0.092868 0.042309 0.031487 0.367104 1.000000 0.240295 0.159096 0.366735 0.361501 0.025458 0.051817 -0.015623 0.081563 -0.011895 0.029106 0.018965 -0.003502 -0.063741 0.082549 0.041606 0.001521 0.047163 0.080600 0.038000 -0.037051 -0.019794 0.079758 0.014725 0.018443 -0.013376 0.008766 0.000830 -0.001308 -0.025001 -0.025179 -0.016019
LBXPFNA 0.071768 -0.022313 -0.011042 0.022639 0.012000 0.004378 0.003414 -0.037252 0.011420 0.011426 0.021396 0.091027 0.089770 0.049663 0.273211 0.240295 1.000000 0.431411 0.711883 0.574719 0.035871 0.048710 0.006984 0.088406 0.004547 0.036437 0.007447 0.014503 -0.029907 0.075620 0.031733 0.036177 0.066441 0.077031 0.044171 -0.042825 -0.016515 0.070293 -0.005844 -0.007564 -0.024268 -0.013178 -0.011752 -0.021259 -0.012025 -0.027074 0.006191
LBXPFHS 0.038425 -0.022017 0.016803 0.048849 0.036902 0.013645 0.037473 -0.038886 0.015452 0.015497 -0.010610 0.052800 0.001277 -0.002054 0.264158 0.159096 0.431411 1.000000 0.287648 0.165070 0.057208 0.024684 0.006031 0.008788 -0.019081 -0.027768 -0.053234 0.055550 0.004775 0.048265 0.017592 0.036828 0.041176 0.053880 0.032293 -0.032771 0.016587 0.045418 0.004436 0.014354 -0.000354 0.010638 0.007438 0.000595 -0.002336 -0.023900 0.005793
LBXPFDE 0.104914 -0.034604 -0.025146 0.010015 -0.013819 -0.006975 0.001049 -0.086738 0.024183 0.024244 0.035651 0.099836 0.151698 0.072972 0.262634 0.366735 0.711883 0.287648 1.000000 0.754365 -0.007550 0.044202 0.007670 0.082564 0.016857 0.027077 0.002849 -0.027274 -0.059262 0.025619 0.004139 -0.007652 0.035889 0.033574 0.023473 -0.058854 -0.015399 0.025236 -0.040253 -0.005773 -0.033340 -0.018106 -0.018499 -0.022704 0.001055 -0.004412 0.022107
LBXPFUA 0.114822 -0.050581 -0.034601 -0.020095 -0.035910 -0.014593 -0.035104 -0.106293 0.002555 0.002633 0.059691 0.102385 0.228828 0.098317 0.171176 0.361501 0.574719 0.165070 0.754365 1.000000 -0.041593 0.049573 0.000575 0.089885 0.033351 0.036968 0.011258 -0.060819 -0.075107 -0.005569 -0.014625 -0.030603 -0.002710 -0.000166 0.013113 -0.069707 -0.029655 -0.006340 -0.066336 -0.007065 -0.043761 -0.018591 -0.019319 -0.023169 -0.000267 0.008976 0.016343
URXBPH -0.052036 0.017345 0.054806 0.010087 0.011760 0.014659 0.007298 0.019975 -0.036025 -0.036008 -0.004681 -0.001403 -0.040945 0.036320 0.075008 0.025458 0.035871 0.057208 -0.007550 -0.041593 1.000000 0.147183 0.124354 0.147506 0.140161 0.227003 0.198849 0.349095 0.267763 0.463889 0.507276 0.409479 0.367465 0.481790 0.400325 0.425609 0.185482 0.490261 0.480276 0.208404 0.233228 0.246652 0.276966 0.292380 -0.053165 -0.071083 -0.018315
URXTRS 0.007028 0.006948 -0.012328 0.019699 0.015637 0.004016 0.017551 -0.011724 -0.022449 -0.022485 -0.026339 -0.029658 0.043969 0.026710 0.051121 0.051817 0.048710 0.024684 0.044202 0.049573 0.147183 1.000000 0.147692 0.176276 0.142249 0.166520 0.154998 0.134445 0.089091 0.209084 0.170628 0.133347 0.200237 0.194528 0.167909 0.105607 0.079227 0.195391 0.115123 0.019495 0.028982 0.014788 0.032948 0.088000 0.013946 0.025922 -0.004257
URXBP3 -0.007265 -0.011590 -0.047695 -0.018234 -0.025200 -0.007062 -0.006594 -0.034506 -0.068459 -0.068446 -0.035683 -0.079913 0.056446 0.028407 -0.011414 -0.015623 0.006984 0.006031 0.007670 0.000575 0.124354 0.147692 1.000000 0.216625 0.259654 0.277838 0.298543 0.173518 0.187773 0.158292 0.181523 0.191800 0.113358 0.139980 0.112130 0.149969 0.101400 0.145904 0.115187 -0.003789 0.040930 -0.017669 0.008097 0.096956 0.015527 0.048153 -0.033809
URXBUP 0.000539 -0.027212 -0.027835 -0.038261 -0.027503 -0.001962 -0.032553 -0.032427 -0.047853 -0.047854 0.004770 -0.031903 0.052046 0.019473 0.080892 0.081563 0.088406 0.008788 0.082564 0.089885 0.147506 0.176276 0.216625 1.000000 0.420127 0.418030 0.445810 0.092791 -0.010443 0.219584 0.196666 0.135580 0.223052 0.217083 0.180310 0.074325 0.043175 0.214734 0.104504 0.031885 0.020255 0.008582 0.034143 0.083219 -0.013645 0.012961 -0.003387
URXEPB 0.050070 -0.015703 0.015435 -0.014865 -0.038929 -0.014748 -0.018610 -0.063172 -0.060605 -0.060634 0.051736 -0.009271 0.059693 0.040041 0.007470 -0.011895 0.004547 -0.019081 0.016857 0.033351 0.140161 0.142249 0.259654 0.420127 1.000000 0.487131 0.481681 0.112136 0.120277 0.109904 0.172112 0.130442 0.176011 0.117050 0.138134 0.154251 0.099813 0.110185 0.091529 0.089756 0.094462 0.085148 0.093562 0.123945 -0.012617 0.012675 -0.026673
URXMPB 0.017795 0.013729 -0.022472 -0.039840 -0.037678 -0.002526 -0.023084 -0.006174 -0.046914 -0.046903 0.003359 -0.045575 0.024701 0.040130 0.007771 0.029106 0.036437 -0.027768 0.027077 0.036968 0.227003 0.166520 0.277838 0.418030 0.487131 1.000000 0.827147 0.172665 0.127276 0.234936 0.308348 0.193652 0.352957 0.238077 0.213889 0.261624 0.103442 0.239427 0.211237 0.077430 0.131640 0.072077 0.094411 0.123903 -0.020207 0.004155 -0.014588
URXPPB 0.002019 0.004688 -0.033499 -0.055292 -0.052249 -0.007367 -0.031622 -0.010344 -0.066700 -0.066726 0.000491 -0.073508 0.013835 0.024067 0.001948 0.018965 0.007447 -0.053234 0.002849 0.011258 0.198849 0.154998 0.298543 0.445810 0.481681 0.827147 1.000000 0.140762 0.108131 0.193792 0.274269 0.166348 0.320985 0.203812 0.203874 0.219969 0.098657 0.206300 0.194011 0.041557 0.109976 0.047167 0.069413 0.106926 -0.034523 0.007054 -0.025858
URXCNP -0.019836 0.054427 0.016736 0.040792 0.033731 0.017578 0.045956 0.032946 -0.009345 -0.009302 -0.021199 -0.019247 -0.013142 0.034547 0.034075 -0.003502 0.014503 0.055550 -0.027274 -0.060819 0.349095 0.134445 0.173518 0.092791 0.112136 0.172665 0.140762 1.000000 0.618849 0.483670 0.367830 0.635708 0.236778 0.435868 0.359436 0.335661 0.435917 0.441603 0.327628 0.161853 0.178888 0.166450 0.188201 0.239195 -0.011106 -0.024189 0.003673
URXCOP -0.039036 0.046988 0.010988 0.034353 0.019189 0.008009 0.042326 0.064482 -0.000618 -0.000629 -0.039583 -0.077987 -0.012859 0.046197 -0.046931 -0.063741 -0.029907 0.004775 -0.059262 -0.075107 0.267763 0.089091 0.187773 -0.010443 0.120277 0.127276 0.108131 0.618849 1.000000 0.351404 0.260520 0.732770 0.112946 0.313209 0.282293 0.368948 0.669951 0.324965 0.257313 0.080142 0.159030 0.102285 0.116859 0.166852 -0.004938 -0.008356 -0.000488
URXECP -0.072532 0.024656 0.017814 0.054915 0.053835 0.024111 0.054523 0.061831 -0.004863 -0.004815 -0.024228 0.004016 -0.023710 0.039519 0.117080 0.082549 0.075620 0.048265 0.025619 -0.005569 0.463889 0.209084 0.158292 0.219584 0.109904 0.234936 0.193792 0.483670 0.351404 1.000000 0.601894 0.571587 0.409682 0.954633 0.785646 0.427790 0.308156 0.959086 0.522091 0.176638 0.171112 0.184258 0.219289 0.274339 -0.012043 -0.039747 0.005039
URXMBP -0.077049 0.016557 0.030500 -0.010179 -0.007952 0.010608 -0.006636 0.049967 -0.012439 -0.012410 0.008434 0.004545 -0.035459 0.057045 0.094307 0.041606 0.031733 0.017592 0.004139 -0.014625 0.507276 0.170628 0.181523 0.196666 0.172112 0.308348 0.274269 0.367830 0.260520 0.601894 1.000000 0.508607 0.456371 0.631331 0.501003 0.704682 0.221166 0.642819 0.713835 0.228500 0.279663 0.268746 0.306498 0.349468 -0.058330 -0.056617 -0.004023
URXMC1 -0.052441 0.022802 0.031569 0.051504 0.040538 0.017884 0.054428 0.052259 -0.004157 -0.004097 -0.018317 -0.031711 -0.025321 0.055757 0.057998 0.001521 0.036177 0.036828 -0.007652 -0.030603 0.409479 0.133347 0.191800 0.135580 0.130442 0.193652 0.166348 0.635708 0.732770 0.571587 0.508607 1.000000 0.275164 0.556277 0.454681 0.440910 0.615210 0.563545 0.456788 0.183443 0.210298 0.198018 0.221253 0.268346 -0.007257 -0.042238 -0.011307
URXMEP 0.008772 0.072135 0.025856 0.002181 0.006686 0.009579 -0.002662 0.037279 0.004544 0.004545 0.004204 0.022656 -0.006683 0.021110 0.086494 0.047163 0.066441 0.041176 0.035889 -0.002710 0.367465 0.200237 0.113358 0.223052 0.176011 0.352957 0.320985 0.236778 0.112946 0.409682 0.456371 0.275164 1.000000 0.414057 0.323891 0.354573 0.114608 0.413991 0.373875 0.157758 0.200942 0.165065 0.199519 0.222287 -0.033298 -0.043929 0.014999
URXMHH -0.071952 0.023753 0.025792 0.061417 0.051327 0.022351 0.062336 0.062876 -0.004699 -0.004660 -0.020392 0.005668 -0.021957 0.036763 0.110962 0.080600 0.077031 0.053880 0.033574 -0.000166 0.481790 0.194528 0.139980 0.217083 0.117050 0.238077 0.203812 0.435868 0.313209 0.954633 0.631331 0.556277 0.414057 1.000000 0.815825 0.462766 0.301488 0.987170 0.557243 0.190354 0.177058 0.213512 0.241441 0.294033 -0.019916 -0.045528 0.002565
URXMHP -0.057099 0.011157 0.028521 0.045509 0.030530 0.013380 0.048650 0.031826 -0.012199 -0.012172 -0.003666 -0.005111 -0.000913 0.056583 0.059949 0.038000 0.044171 0.032293 0.023473 0.013113 0.400325 0.167909 0.112130 0.180310 0.138134 0.213889 0.203874 0.359436 0.282293 0.785646 0.501003 0.454681 0.323891 0.815825 1.000000 0.398512 0.402269 0.811381 0.427362 0.175129 0.186194 0.227031 0.240318 0.259142 -0.016418 -0.031536 -0.007709
URXMIB -0.078938 0.043477 0.025864 -0.021754 -0.023047 0.001655 -0.014188 0.079954 0.007175 0.007154 -0.018351 -0.036558 -0.027209 0.051446 -0.003407 -0.037051 -0.042825 -0.032771 -0.058854 -0.069707 0.425609 0.105607 0.149969 0.074325 0.154251 0.261624 0.219969 0.335661 0.368948 0.427790 0.704682 0.440910 0.354573 0.462766 0.398512 1.000000 0.222114 0.476698 0.580313 0.178442 0.282471 0.225757 0.255908 0.302743 -0.039326 -0.039532 -0.007790
URXMNP -0.037882 0.000334 0.023444 0.037501 0.016704 0.004944 0.040118 0.040170 -0.002584 -0.002558 -0.017925 -0.044696 -0.008042 0.051441 -0.000381 -0.019794 -0.016515 0.016587 -0.015399 -0.029655 0.185482 0.079227 0.101400 0.043175 0.099813 0.103442 0.098657 0.435917 0.669951 0.308156 0.221166 0.615210 0.114608 0.301488 0.402269 0.222114 1.000000 0.295867 0.176750 0.064845 0.103344 0.095345 0.095047 0.118141 -0.011444 -0.022203 -0.008673
URXMOH -0.084542 0.020893 0.019869 0.054698 0.048178 0.022366 0.059006 0.062996 -0.010806 -0.010765 -0.024708 -0.005132 -0.026370 0.036990 0.109072 0.079758 0.070293 0.045418 0.025236 -0.006340 0.490261 0.195391 0.145904 0.214734 0.110185 0.239427 0.206300 0.441603 0.324965 0.959086 0.642819 0.563545 0.413991 0.987170 0.811381 0.476698 0.295867 1.000000 0.573781 0.188118 0.180969 0.209293 0.239041 0.294115 -0.023223 -0.046657 -0.003231
URXMZP -0.127855 0.018343 0.040268 0.012892 0.014769 0.014281 0.017889 0.063261 -0.041545 -0.041558 -0.021559 -0.033446 -0.064066 0.031205 0.078877 0.014725 -0.005844 0.004436 -0.040253 -0.066336 0.480276 0.115123 0.115187 0.104504 0.091529 0.211237 0.194011 0.327628 0.257313 0.522091 0.713835 0.456788 0.373875 0.557243 0.427362 0.580313 0.176750 0.573781 1.000000 0.185719 0.234520 0.247408 0.273666 0.304638 -0.053932 -0.053804 -0.027208
URXP01 0.089793 0.000241 0.087382 0.029931 0.014994 0.011504 0.007935 -0.047075 0.017032 0.017038 0.283455 0.168714 0.003810 0.092305 0.016038 0.018443 -0.007564 0.014354 -0.005773 -0.007065 0.208404 0.019495 -0.003789 0.031885 0.089756 0.077430 0.041557 0.161853 0.080142 0.176638 0.228500 0.183443 0.157758 0.190354 0.175129 0.178442 0.064845 0.188118 0.185719 1.000000 0.580734 0.739686 0.722313 0.600033 -0.037827 -0.077841 0.021351
URXP02 0.011790 0.096730 0.103168 -0.011944 -0.020790 0.000098 -0.015958 0.045784 0.034159 0.034114 0.170835 0.008869 -0.072821 0.105401 -0.017730 -0.013376 -0.024268 -0.000354 -0.033340 -0.043761 0.233228 0.028982 0.040930 0.020255 0.094462 0.131640 0.109976 0.178888 0.159030 0.171112 0.279663 0.210298 0.200942 0.177058 0.186194 0.282471 0.103344 0.180969 0.234520 0.580734 1.000000 0.654846 0.683528 0.567045 -0.075100 -0.068848 0.023527
URXP03 0.020125 0.000286 0.126131 0.034726 0.011470 0.009283 0.014238 -0.041164 -0.006506 -0.006522 0.287831 0.121121 -0.043994 0.098307 0.001807 0.008766 -0.013178 0.010638 -0.018106 -0.018591 0.246652 0.014788 -0.017669 0.008582 0.085148 0.072077 0.047167 0.166450 0.102285 0.184258 0.268746 0.198018 0.165065 0.213512 0.227031 0.225757 0.095345 0.209293 0.247408 0.739686 0.654846 1.000000 0.957671 0.739636 -0.076111 -0.091917 -0.005565
URXP04 0.034065 0.045689 0.118798 0.028441 0.009867 0.009833 0.009136 -0.006053 0.011443 0.011431 0.265966 0.118584 -0.040516 0.112249 -0.002726 0.000830 -0.011752 0.007438 -0.018499 -0.019319 0.276966 0.032948 0.008097 0.034143 0.093562 0.094411 0.069413 0.188201 0.116859 0.219289 0.306498 0.221253 0.199519 0.241441 0.240318 0.255908 0.095047 0.239041 0.273666 0.722313 0.683528 0.957671 1.000000 0.803507 -0.071718 -0.091878 0.008342
URXP06 0.024007 0.057947 0.079881 0.016261 0.005457 0.010220 0.003620 0.017406 0.003245 0.003207 0.109110 0.074794 -0.017936 0.144847 -0.006888 -0.001308 -0.021259 0.000595 -0.022704 -0.023169 0.292380 0.088000 0.096956 0.083219 0.123945 0.123903 0.106926 0.239195 0.166852 0.274339 0.349468 0.268346 0.222287 0.294033 0.259142 0.302743 0.118141 0.294115 0.304638 0.600033 0.567045 0.739636 0.803507 1.000000 -0.044658 -0.075691 0.005294
DR1TFIBE 0.110284 -0.003748 -0.015875 0.534127 0.550612 0.118924 0.418043 -0.030956 0.032780 0.032748 -0.047857 0.040778 0.073392 -0.005508 -0.022200 -0.025001 -0.012025 -0.002336 0.001055 -0.000267 -0.053165 0.013946 0.015527 -0.013645 -0.012617 -0.020207 -0.034523 -0.011106 -0.004938 -0.012043 -0.058330 -0.007257 -0.033298 -0.019916 -0.016418 -0.039326 -0.011444 -0.023223 -0.053932 -0.037827 -0.075100 -0.076111 -0.071718 -0.044658 1.000000 0.098028 0.012433
DR1_320Z 0.021556 0.024218 -0.043760 -0.043463 -0.064822 -0.048236 -0.011403 -0.027378 0.008299 0.008312 -0.081805 -0.053055 0.081617 0.006512 -0.037555 -0.025179 -0.027074 -0.023900 -0.004412 0.008976 -0.071083 0.025922 0.048153 0.012961 0.012675 0.004155 0.007054 -0.024189 -0.008356 -0.039747 -0.056617 -0.042238 -0.043929 -0.045528 -0.031536 -0.039532 -0.022203 -0.046657 -0.053804 -0.077841 -0.068848 -0.091917 -0.091878 -0.075691 0.098028 1.000000 0.001123
diabetes 0.231925 0.144196 0.020119 -0.027442 -0.030970 -0.011102 -0.023162 0.124081 0.652247 0.652336 0.068836 0.081720 0.029791 0.018170 -0.005594 -0.016019 0.006191 0.005793 0.022107 0.016343 -0.018315 -0.004257 -0.033809 -0.003387 -0.026673 -0.014588 -0.025858 0.003673 -0.000488 0.005039 -0.004023 -0.011307 0.014999 0.002565 -0.007709 -0.007790 -0.008673 -0.003231 -0.027208 0.021351 0.023527 -0.005565 0.008342 0.005294 0.012433 0.001123 1.000000
InĀ [84]:
# chi square tests
chisq_results = {}
categorical_nhanes.remove("diabetes")

for col in categorical_nhanes:
    # Create a contingency table
    contingency_table = pd.crosstab(train_data[col], train_data["diabetes"])

    # Perform the Chi-Square test
    chi2, p, dof, expected = chi2_contingency(contingency_table)

    # Store results
    chisq_results[col] = {"Chi2": chi2, "p-value": p, "Degrees of Freedom": dof}

# Convert results to a DataFrame for better visualization
chisq_results_df = pd.DataFrame(chisq_results).T
print(chisq_results_df)
                Chi2       p-value  Degrees of Freedom
RIAGENDR    7.464367  6.293216e-03                 1.0
RIDRETH1    2.141709  1.433421e-01                 1.0
DMDEDUC2  172.867247  1.749690e-39                 1.0
INDHHIN2    8.582897  3.393351e-03                 1.0
DPQ010      8.269928  4.030711e-03                 1.0
PAQ710      3.186036  7.426990e-02                 1.0
HUQ051      4.112754  4.256100e-02                 1.0
HOQ065      4.539456  3.312211e-02                 1.0
SMQ020     56.818977  4.778330e-14                 1.0
MCQ300C   177.497618  1.705346e-40                 1.0
HIQ011      7.793265  5.244133e-03                 1.0
SLQ050      3.312152  6.876937e-02                 1.0
InĀ [86]:
# pairplots among top 10 variables most correlated with diabetes
diq_corr = corr["diabetes"].abs().sort_values(ascending=False)
print(diq_corr.head(11))
num_corr_diabetes = ["LBDGLUSI", "LBXGLU", "RIDAGEYR", "LBXIN", "BMXBMI", "LBXBPB", "LBXTHG", "LBXBCD", "URXBP3", "DR1TCARB"]

nhanes_numeric_nonan = train_data[num_corr_diabetes].dropna()

sns.pairplot(nhanes_numeric_nonan, diag_kind="hist", plot_kws={'alpha': 0.5, 's': 5})
plt.show()
diabetes    1.000000
LBDGLUSI    0.652336
LBXGLU      0.652247
RIDAGEYR    0.231925
BMXBMI      0.144196
LBXIN       0.124081
LBXBPB      0.081720
LBXBCD      0.068836
URXBP3      0.033809
DR1TCARB    0.030970
LBXTHG      0.029791
Name: diabetes, dtype: float64
No description has been provided for this image
InĀ [88]:
# top 18 variables least correlated with diabetes (corr value under 0.01)
diq_corr_asc = corr["diabetes"].abs().sort_values(ascending=True)
print(diq_corr_asc.head(18))
URXCOP      0.000488
DR1_320Z    0.001123
URXMHH      0.002565
URXMOH      0.003231
URXBUP      0.003387
URXCNP      0.003673
URXMBP      0.004023
URXTRS      0.004257
URXECP      0.005039
URXP06      0.005294
URXP03      0.005565
LBXMPAH     0.005594
LBXPFHS     0.005793
LBXPFNA     0.006191
URXMHP      0.007709
URXMIB      0.007790
URXP04      0.008342
URXMNP      0.008673
Name: diabetes, dtype: float64
InĀ [90]:
# heatmaps
p_values = chisq_results_df["p-value"].astype(float).to_frame()

plt.figure(figsize=(8, 6))
sns.heatmap(p_values.T, annot=True, cmap="coolwarm", linewidths=0.5, fmt=".3f")

plt.title("Chi-Square Test p-values (Association with Diabetes)")
plt.show()
No description has been provided for this image
InĀ [92]:
# drop variables (correlation value <0.01 with diabetes OR p-value >0.05 in chi-square test with diabetes)
train_data = train_data.drop(columns=["DR1_320Z", "LBXMPAH", "LBXPFNA", "LBXPFHS", "URXP03", "URXP04", "URXP06", "URXTRS", 
                                      "URXBUP", "URXCNP", "URXCOP", "URXECP", "URXMBP", "URXMHH", "URXMHP", "URXMIB", 
                                      "URXMNP", "URXMOH", "RIDRETH1", "PAQ710", "SLQ050"])
test_data = test_data.drop(columns=["DR1_320Z", "LBXMPAH", "LBXPFNA", "LBXPFHS", "URXP03", "URXP04", "URXP06", "URXTRS", 
                                      "URXBUP", "URXCNP", "URXCOP", "URXECP", "URXMBP", "URXMHH", "URXMHP", "URXMIB", 
                                      "URXMNP", "URXMOH", "RIDRETH1", "PAQ710", "SLQ050"])

Discussion for Exploratory Data Analysis and Bivariate Analysis¶

LBDGLUSI and LBXGLU, DR1TCARB and DR1TKCAL, DR1TTFAT and DR1TKCAL, DR1TCARB and DR1TSUGR, URXECP and URXMOH, URXMOH and URXMHH are all highly correlated. Their direction of correlation makes sense as they are all measure similar things that have overlap with one another. In terms of correlations with diabetes, the predictors that are highly correlated with the target variable are LBDGLUSI at 0.644912, LBXGLU at 0.644864, and LBXIN at 0.166696. For the variables that I would remove because of how weak they are, I would choose to remove SLQ050, DR1TFIBE, DR1_320Z, URXP01, URXMNP, URXMNP, URXCNP, URXEPB, URXBUP, LBXPFUA, LBXPFHS, and LBXPFNA because their correlation values with the target variable are less than 0.01.

Exporting Train and Test Data¶

InĀ [Ā ]:
train_data.to_csv('/Users/kevinnguyen/Downloads/nhanes_training_data.csv', index=False)
test_data.to_csv('/Users/kevinnguyen/Downloads/nhanes_testing_data.csv', index=False)
InĀ [Ā ]:
train_data_og = train_data.drop(columns=['DPQ010', 'MCQ300C', 'HUQ051', 'HIQ011', 'HOQ065', 'DR1TFIBE'])
test_data_og = test_data.drop(columns=['DPQ010', 'MCQ300C', 'HUQ051', 'HIQ011', 'HOQ065', 'DR1TFIBE'])
train_data_og.to_csv('/Users/kevinnguyen/Downloads/nhanes_training_data_og.csv', index=False)
test_data_og.to_csv('/Users/kevinnguyen/Downloads/nhanes_testing_data_og.csv', index=False)

EDA Figures and Tables for Final Modeling Dataset¶

InĀ [100]:
categorical_nhanes = ["RIAGENDR", "DMDEDUC2", "INDHHIN2", "DPQ010", "HUQ051", "HOQ065", "SMQ020", 
                      "diabetes", "MCQ300C", "HIQ011"]
continuous_nhanes = ["RIDAGEYR", "BMXBMI", "ALQ130", "DR1TKCAL", "DR1TCARB", "DR1TSUGR", "DR1TTFAT", "LBXIN", "LBXGLU", 
                     "LBDGLUSI", "LBXBCD", "LBXBPB", "LBXTHG", "URXUAS", "LBXPFDO", "LBXPFDE", "LBXPFUA", "URXBPH",  
                     "URXBP3", "URXEPB", "URXMPB", "URXPPB", "URXMC1", 
                     "URXMEP", "URXMZP", "URXP01", "URXP02", "DR1TFIBE"]
InĀ [98]:
numerical_nhanes = train_data[continuous_nhanes]
numerical_nhanes["diabetes"] = pd.to_numeric(train_data["diabetes"], errors="coerce")
corr = numerical_nhanes.loc[:,~numerical_nhanes.columns.duplicated()].corr()
corr.head(100)
corr.style.background_gradient(cmap='coolwarm')
Out[98]:
Ā  RIDAGEYR BMXBMI ALQ130 DR1TKCAL DR1TCARB DR1TSUGR DR1TTFAT LBXIN LBXGLU LBDGLUSI LBXBCD LBXBPB LBXTHG URXUAS LBXPFDO LBXPFDE LBXPFUA URXBPH URXBP3 URXEPB URXMPB URXPPB URXMC1 URXMEP URXMZP URXP01 URXP02 DR1TFIBE diabetes
RIDAGEYR 1.000000 0.276835 -0.049195 -0.022535 -0.063251 -0.025177 -0.027491 -0.177180 0.275097 0.275131 0.359195 0.415399 0.280666 0.078363 -0.000235 0.104914 0.114822 -0.052036 -0.007265 0.050070 0.017795 0.002019 -0.052441 0.008772 -0.127855 0.089793 0.011790 0.110284 0.231925
BMXBMI 0.276835 1.000000 0.070655 -0.010689 -0.039629 -0.002268 0.004131 0.465490 0.226896 0.226870 0.024432 0.015792 0.031764 0.016020 -0.028180 -0.034604 -0.050581 0.017345 -0.011590 -0.015703 0.013729 0.004688 0.022802 0.072135 0.018343 0.000241 0.096730 -0.003748 0.144196
ALQ130 -0.049195 0.070655 1.000000 0.122885 0.070473 0.010235 0.062827 0.007606 0.056448 0.056438 0.097814 0.095320 -0.041142 0.020626 -0.013071 -0.025146 -0.034601 0.054806 -0.047695 0.015435 -0.022472 -0.033499 0.031569 0.025856 0.040268 0.087382 0.103168 -0.015875 0.020119
DR1TKCAL -0.022535 -0.010689 0.122885 1.000000 0.877693 0.258928 0.872380 0.000511 0.007269 0.007290 -0.004915 0.034032 0.002365 0.011704 0.002527 0.010015 -0.020095 0.010087 -0.018234 -0.014865 -0.039840 -0.055292 0.051504 0.002181 0.012892 0.029931 -0.011944 0.534127 -0.027442
DR1TCARB -0.063251 -0.039629 0.070473 0.877693 1.000000 0.450036 0.633181 0.029471 -0.000078 -0.000066 -0.033193 0.001565 -0.044464 -0.015182 0.001369 -0.013819 -0.035910 0.011760 -0.025200 -0.038929 -0.037678 -0.052249 0.040538 0.006686 0.014769 0.014994 -0.020790 0.550612 -0.030970
DR1TSUGR -0.025177 -0.002268 0.010235 0.258928 0.450036 1.000000 0.175195 0.025237 0.008435 0.008442 -0.021060 -0.013685 -0.025431 -0.010501 0.006104 -0.006975 -0.014593 0.014659 -0.007062 -0.014748 -0.002526 -0.007367 0.017884 0.009579 0.014281 0.011504 0.000098 0.118924 -0.011102
DR1TTFAT -0.027491 0.004131 0.062827 0.872380 0.633181 0.175195 1.000000 0.019355 0.007106 0.007119 -0.035198 -0.010152 -0.013558 -0.006299 0.000724 0.001049 -0.035104 0.007298 -0.006594 -0.018610 -0.023084 -0.031622 0.054428 -0.002662 0.017889 0.007935 -0.015958 0.418043 -0.023162
LBXIN -0.177180 0.465490 0.007606 0.000511 0.029471 0.025237 0.019355 1.000000 0.211596 0.211561 -0.151057 -0.138398 -0.112682 -0.021270 -0.053862 -0.086738 -0.106293 0.019975 -0.034506 -0.063172 -0.006174 -0.010344 0.052259 0.037279 0.063261 -0.047075 0.045784 -0.030956 0.124081
LBXGLU 0.275097 0.226896 0.056448 0.007269 -0.000078 0.008435 0.007106 0.211596 1.000000 0.999994 0.050378 0.144172 0.033099 0.024445 -0.006374 0.024183 0.002555 -0.036025 -0.068459 -0.060605 -0.046914 -0.066700 -0.004157 0.004544 -0.041545 0.017032 0.034159 0.032780 0.652247
LBDGLUSI 0.275131 0.226870 0.056438 0.007290 -0.000066 0.008442 0.007119 0.211561 0.999994 1.000000 0.050424 0.144230 0.033159 0.024522 -0.006326 0.024244 0.002633 -0.036008 -0.068446 -0.060634 -0.046903 -0.066726 -0.004097 0.004545 -0.041558 0.017038 0.034114 0.032748 0.652336
LBXBCD 0.359195 0.024432 0.097814 -0.004915 -0.033193 -0.021060 -0.035198 -0.151057 0.050378 0.050424 1.000000 0.385347 0.127550 0.056038 0.016960 0.035651 0.059691 -0.004681 -0.035683 0.051736 0.003359 0.000491 -0.018317 0.004204 -0.021559 0.283455 0.170835 -0.047857 0.068836
LBXBPB 0.415399 0.015792 0.095320 0.034032 0.001565 -0.013685 -0.010152 -0.138398 0.144172 0.144230 0.385347 1.000000 0.199550 0.107696 0.092868 0.099836 0.102385 -0.001403 -0.079913 -0.009271 -0.045575 -0.073508 -0.031711 0.022656 -0.033446 0.168714 0.008869 0.040778 0.081720
LBXTHG 0.280666 0.031764 -0.041142 0.002365 -0.044464 -0.025431 -0.013558 -0.112682 0.033099 0.033159 0.127550 0.199550 1.000000 0.300720 0.042309 0.151698 0.228828 -0.040945 0.056446 0.059693 0.024701 0.013835 -0.025321 -0.006683 -0.064066 0.003810 -0.072821 0.073392 0.029791
URXUAS 0.078363 0.016020 0.020626 0.011704 -0.015182 -0.010501 -0.006299 -0.021270 0.024445 0.024522 0.056038 0.107696 0.300720 1.000000 0.031487 0.072972 0.098317 0.036320 0.028407 0.040041 0.040130 0.024067 0.055757 0.021110 0.031205 0.092305 0.105401 -0.005508 0.018170
LBXPFDO -0.000235 -0.028180 -0.013071 0.002527 0.001369 0.006104 0.000724 -0.053862 -0.006374 -0.006326 0.016960 0.092868 0.042309 0.031487 1.000000 0.366735 0.361501 0.025458 -0.015623 -0.011895 0.029106 0.018965 0.001521 0.047163 0.014725 0.018443 -0.013376 -0.025001 -0.016019
LBXPFDE 0.104914 -0.034604 -0.025146 0.010015 -0.013819 -0.006975 0.001049 -0.086738 0.024183 0.024244 0.035651 0.099836 0.151698 0.072972 0.366735 1.000000 0.754365 -0.007550 0.007670 0.016857 0.027077 0.002849 -0.007652 0.035889 -0.040253 -0.005773 -0.033340 0.001055 0.022107
LBXPFUA 0.114822 -0.050581 -0.034601 -0.020095 -0.035910 -0.014593 -0.035104 -0.106293 0.002555 0.002633 0.059691 0.102385 0.228828 0.098317 0.361501 0.754365 1.000000 -0.041593 0.000575 0.033351 0.036968 0.011258 -0.030603 -0.002710 -0.066336 -0.007065 -0.043761 -0.000267 0.016343
URXBPH -0.052036 0.017345 0.054806 0.010087 0.011760 0.014659 0.007298 0.019975 -0.036025 -0.036008 -0.004681 -0.001403 -0.040945 0.036320 0.025458 -0.007550 -0.041593 1.000000 0.124354 0.140161 0.227003 0.198849 0.409479 0.367465 0.480276 0.208404 0.233228 -0.053165 -0.018315
URXBP3 -0.007265 -0.011590 -0.047695 -0.018234 -0.025200 -0.007062 -0.006594 -0.034506 -0.068459 -0.068446 -0.035683 -0.079913 0.056446 0.028407 -0.015623 0.007670 0.000575 0.124354 1.000000 0.259654 0.277838 0.298543 0.191800 0.113358 0.115187 -0.003789 0.040930 0.015527 -0.033809
URXEPB 0.050070 -0.015703 0.015435 -0.014865 -0.038929 -0.014748 -0.018610 -0.063172 -0.060605 -0.060634 0.051736 -0.009271 0.059693 0.040041 -0.011895 0.016857 0.033351 0.140161 0.259654 1.000000 0.487131 0.481681 0.130442 0.176011 0.091529 0.089756 0.094462 -0.012617 -0.026673
URXMPB 0.017795 0.013729 -0.022472 -0.039840 -0.037678 -0.002526 -0.023084 -0.006174 -0.046914 -0.046903 0.003359 -0.045575 0.024701 0.040130 0.029106 0.027077 0.036968 0.227003 0.277838 0.487131 1.000000 0.827147 0.193652 0.352957 0.211237 0.077430 0.131640 -0.020207 -0.014588
URXPPB 0.002019 0.004688 -0.033499 -0.055292 -0.052249 -0.007367 -0.031622 -0.010344 -0.066700 -0.066726 0.000491 -0.073508 0.013835 0.024067 0.018965 0.002849 0.011258 0.198849 0.298543 0.481681 0.827147 1.000000 0.166348 0.320985 0.194011 0.041557 0.109976 -0.034523 -0.025858
URXMC1 -0.052441 0.022802 0.031569 0.051504 0.040538 0.017884 0.054428 0.052259 -0.004157 -0.004097 -0.018317 -0.031711 -0.025321 0.055757 0.001521 -0.007652 -0.030603 0.409479 0.191800 0.130442 0.193652 0.166348 1.000000 0.275164 0.456788 0.183443 0.210298 -0.007257 -0.011307
URXMEP 0.008772 0.072135 0.025856 0.002181 0.006686 0.009579 -0.002662 0.037279 0.004544 0.004545 0.004204 0.022656 -0.006683 0.021110 0.047163 0.035889 -0.002710 0.367465 0.113358 0.176011 0.352957 0.320985 0.275164 1.000000 0.373875 0.157758 0.200942 -0.033298 0.014999
URXMZP -0.127855 0.018343 0.040268 0.012892 0.014769 0.014281 0.017889 0.063261 -0.041545 -0.041558 -0.021559 -0.033446 -0.064066 0.031205 0.014725 -0.040253 -0.066336 0.480276 0.115187 0.091529 0.211237 0.194011 0.456788 0.373875 1.000000 0.185719 0.234520 -0.053932 -0.027208
URXP01 0.089793 0.000241 0.087382 0.029931 0.014994 0.011504 0.007935 -0.047075 0.017032 0.017038 0.283455 0.168714 0.003810 0.092305 0.018443 -0.005773 -0.007065 0.208404 -0.003789 0.089756 0.077430 0.041557 0.183443 0.157758 0.185719 1.000000 0.580734 -0.037827 0.021351
URXP02 0.011790 0.096730 0.103168 -0.011944 -0.020790 0.000098 -0.015958 0.045784 0.034159 0.034114 0.170835 0.008869 -0.072821 0.105401 -0.013376 -0.033340 -0.043761 0.233228 0.040930 0.094462 0.131640 0.109976 0.210298 0.200942 0.234520 0.580734 1.000000 -0.075100 0.023527
DR1TFIBE 0.110284 -0.003748 -0.015875 0.534127 0.550612 0.118924 0.418043 -0.030956 0.032780 0.032748 -0.047857 0.040778 0.073392 -0.005508 -0.025001 0.001055 -0.000267 -0.053165 0.015527 -0.012617 -0.020207 -0.034523 -0.007257 -0.033298 -0.053932 -0.037827 -0.075100 1.000000 0.012433
diabetes 0.231925 0.144196 0.020119 -0.027442 -0.030970 -0.011102 -0.023162 0.124081 0.652247 0.652336 0.068836 0.081720 0.029791 0.018170 -0.016019 0.022107 0.016343 -0.018315 -0.033809 -0.026673 -0.014588 -0.025858 -0.011307 0.014999 -0.027208 0.021351 0.023527 0.012433 1.000000
InĀ [102]:
chisq_results = {}
categorical_nhanes.remove("diabetes")

for col in categorical_nhanes:
    # Create a contingency table
    contingency_table = pd.crosstab(train_data[col], train_data["diabetes"])

    # Perform the Chi-Square test
    chi2, p, dof, expected = chi2_contingency(contingency_table)

    # Store results
    chisq_results[col] = {"Chi2": chi2, "p-value": p, "Degrees of Freedom": dof}

# Convert results to a DataFrame for better visualization
chisq_results_df = pd.DataFrame(chisq_results).T
print(chisq_results_df)
                Chi2       p-value  Degrees of Freedom
RIAGENDR    7.464367  6.293216e-03                 1.0
DMDEDUC2  172.867247  1.749690e-39                 1.0
INDHHIN2    8.582897  3.393351e-03                 1.0
DPQ010      8.269928  4.030711e-03                 1.0
HUQ051      4.112754  4.256100e-02                 1.0
HOQ065      4.539456  3.312211e-02                 1.0
SMQ020     56.818977  4.778330e-14                 1.0
MCQ300C   177.497618  1.705346e-40                 1.0
HIQ011      7.793265  5.244133e-03                 1.0
InĀ [104]:
diq_corr = corr["diabetes"].abs().sort_values(ascending=False)
print(diq_corr.head(11))
num_corr_diabetes = ["LBDGLUSI", "LBXGLU", "RIDAGEYR", "LBXIN", "BMXBMI", "LBXBPB", "LBXTHG", "LBXBCD", "URXBP3", "DR1TCARB"]

nhanes_numeric_nonan = train_data[num_corr_diabetes].dropna()

sns.pairplot(nhanes_numeric_nonan, diag_kind="hist", plot_kws={'alpha': 0.5, 's': 5})
plt.show()
diabetes    1.000000
LBDGLUSI    0.652336
LBXGLU      0.652247
RIDAGEYR    0.231925
BMXBMI      0.144196
LBXIN       0.124081
LBXBPB      0.081720
LBXBCD      0.068836
URXBP3      0.033809
DR1TCARB    0.030970
LBXTHG      0.029791
Name: diabetes, dtype: float64
No description has been provided for this image
InĀ [106]:
p_values = chisq_results_df["p-value"].astype(float).to_frame()

plt.figure(figsize=(8, 6))
sns.heatmap(p_values.T, annot=True, cmap="coolwarm", linewidths=0.5, fmt=".3f")

plt.title("Chi-Square Test p-values (Association with Diabetes)")
plt.show()
No description has been provided for this image